835 lines
28 KiB
Diff
835 lines
28 KiB
Diff
From 23d48c5fc7aa889dc7798f9c64acd43d9cb34683 Mon Sep 17 00:00:00 2001
|
|
From: Christian Persch <chpe@gnome.org>
|
|
Date: Sun, 12 Feb 2012 21:20:33 +0100
|
|
Subject: [PATCH] regex: Use glib for unicode data
|
|
|
|
Use g_unichar_type() and g_unichar_get_script() instead of pcre tables.
|
|
---
|
|
glib/pcre/pcre_compile.c | 26 +++---
|
|
glib/pcre/pcre_dfa_exec.c | 96 ++++++++--------
|
|
glib/pcre/pcre_exec.c | 26 +++---
|
|
glib/pcre/pcre_internal.h | 11 +--
|
|
glib/pcre/pcre_tables.c | 16 +++
|
|
glib/pcre/pcre_xclass.c | 24 ++--
|
|
glib/pcre/ucp.h | 265 +++++++++++++++++++++++----------------------
|
|
7 files changed, 239 insertions(+), 225 deletions(-)
|
|
|
|
diff --git a/glib/pcre/pcre_compile.c b/glib/pcre/pcre_compile.c
|
|
index 21bef80..a6c84e1 100644
|
|
--- a/glib/pcre/pcre_compile.c
|
|
+++ b/glib/pcre/pcre_compile.c
|
|
@@ -2920,43 +2920,43 @@ Returns: TRUE if auto-possessifying is OK
|
|
static BOOL
|
|
check_char_prop(int c, int ptype, int pdata, BOOL negated)
|
|
{
|
|
-const ucd_record *prop = GET_UCD(c);
|
|
+const pcre_uint8 chartype = UCD_CHARTYPE(c);
|
|
switch(ptype)
|
|
{
|
|
case PT_LAMP:
|
|
- return (prop->chartype == ucp_Lu ||
|
|
- prop->chartype == ucp_Ll ||
|
|
- prop->chartype == ucp_Lt) == negated;
|
|
+ return (chartype == ucp_Lu ||
|
|
+ chartype == ucp_Ll ||
|
|
+ chartype == ucp_Lt) == negated;
|
|
|
|
case PT_GC:
|
|
- return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
|
|
+ return (pdata == PRIV(ucp_gentype)[chartype]) == negated;
|
|
|
|
case PT_PC:
|
|
- return (pdata == prop->chartype) == negated;
|
|
+ return (pdata == chartype) == negated;
|
|
|
|
case PT_SC:
|
|
- return (pdata == prop->script) == negated;
|
|
+ return (pdata == UCD_SCRIPT(c)) == negated;
|
|
|
|
/* These are specials */
|
|
|
|
case PT_ALNUM:
|
|
- return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
|
|
- PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
|
|
+ return (PRIV(ucp_gentype)[chartype] == ucp_L ||
|
|
+ PRIV(ucp_gentype)[chartype] == ucp_N) == negated;
|
|
|
|
case PT_SPACE: /* Perl space */
|
|
- return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
|
|
+ return (PRIV(ucp_gentype)[chartype] == ucp_Z ||
|
|
c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
|
|
== negated;
|
|
|
|
case PT_PXSPACE: /* POSIX space */
|
|
- return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
|
|
+ return (PRIV(ucp_gentype)[chartype] == ucp_Z ||
|
|
c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
|
|
c == CHAR_FF || c == CHAR_CR)
|
|
== negated;
|
|
|
|
case PT_WORD:
|
|
- return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
|
|
- PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
|
|
+ return (PRIV(ucp_gentype)[chartype] == ucp_L ||
|
|
+ PRIV(ucp_gentype)[chartype] == ucp_N ||
|
|
c == CHAR_UNDERSCORE) == negated;
|
|
}
|
|
return FALSE;
|
|
diff --git a/glib/pcre/pcre_dfa_exec.c b/glib/pcre/pcre_dfa_exec.c
|
|
index 9565d46..3f913ce 100644
|
|
--- a/glib/pcre/pcre_dfa_exec.c
|
|
+++ b/glib/pcre/pcre_dfa_exec.c
|
|
@@ -1060,7 +1060,7 @@ for (;;)
|
|
if (clen > 0)
|
|
{
|
|
BOOL OK;
|
|
- const ucd_record * prop = GET_UCD(c);
|
|
+ const pcre_uint8 chartype = UCD_CHARTYPE(c);
|
|
switch(code[1])
|
|
{
|
|
case PT_ANY:
|
|
@@ -1068,43 +1068,43 @@ for (;;)
|
|
break;
|
|
|
|
case PT_LAMP:
|
|
- OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
|
|
- prop->chartype == ucp_Lt;
|
|
+ OK = chartype == ucp_Lu || chartype == ucp_Ll ||
|
|
+ chartype == ucp_Lt;
|
|
break;
|
|
|
|
case PT_GC:
|
|
- OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
|
|
+ OK = PRIV(ucp_gentype)[chartype] == code[2];
|
|
break;
|
|
|
|
case PT_PC:
|
|
- OK = prop->chartype == code[2];
|
|
+ OK = chartype == code[2];
|
|
break;
|
|
|
|
case PT_SC:
|
|
- OK = prop->script == code[2];
|
|
+ OK = UCD_SCRIPT(c) == code[2];
|
|
break;
|
|
|
|
/* These are specials for combination cases. */
|
|
|
|
case PT_ALNUM:
|
|
- OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
|
|
- PRIV(ucp_gentype)[prop->chartype] == ucp_N;
|
|
+ OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
|
|
+ PRIV(ucp_gentype)[chartype] == ucp_N;
|
|
break;
|
|
|
|
case PT_SPACE: /* Perl space */
|
|
- OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
|
|
+ OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
|
|
c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
|
|
break;
|
|
|
|
case PT_PXSPACE: /* POSIX space */
|
|
- OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
|
|
+ OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
|
|
c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
|
|
c == CHAR_FF || c == CHAR_CR;
|
|
break;
|
|
|
|
case PT_WORD:
|
|
- OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
|
|
- PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
|
|
+ OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
|
|
+ PRIV(ucp_gentype)[chartype] == ucp_N ||
|
|
c == CHAR_UNDERSCORE;
|
|
break;
|
|
|
|
@@ -1294,7 +1294,7 @@ for (;;)
|
|
if (clen > 0)
|
|
{
|
|
BOOL OK;
|
|
- const ucd_record * prop = GET_UCD(c);
|
|
+ const pcre_uint8 chartype = UCD_CHARTYPE(c);
|
|
switch(code[2])
|
|
{
|
|
case PT_ANY:
|
|
@@ -1302,43 +1302,43 @@ for (;;)
|
|
break;
|
|
|
|
case PT_LAMP:
|
|
- OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
|
|
- prop->chartype == ucp_Lt;
|
|
+ OK = chartype == ucp_Lu || chartype == ucp_Ll ||
|
|
+ chartype == ucp_Lt;
|
|
break;
|
|
|
|
case PT_GC:
|
|
- OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
|
|
+ OK = PRIV(ucp_gentype)[chartype] == code[3];
|
|
break;
|
|
|
|
case PT_PC:
|
|
- OK = prop->chartype == code[3];
|
|
+ OK = chartype == code[3];
|
|
break;
|
|
|
|
case PT_SC:
|
|
- OK = prop->script == code[3];
|
|
+ OK = UCD_SCRIPT(c) == code[3];
|
|
break;
|
|
|
|
/* These are specials for combination cases. */
|
|
|
|
case PT_ALNUM:
|
|
- OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
|
|
- PRIV(ucp_gentype)[prop->chartype] == ucp_N;
|
|
+ OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
|
|
+ PRIV(ucp_gentype)[chartype] == ucp_N;
|
|
break;
|
|
|
|
case PT_SPACE: /* Perl space */
|
|
- OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
|
|
+ OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
|
|
c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
|
|
break;
|
|
|
|
case PT_PXSPACE: /* POSIX space */
|
|
- OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
|
|
+ OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
|
|
c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
|
|
c == CHAR_FF || c == CHAR_CR;
|
|
break;
|
|
|
|
case PT_WORD:
|
|
- OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
|
|
- PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
|
|
+ OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
|
|
+ PRIV(ucp_gentype)[chartype] == ucp_N ||
|
|
c == CHAR_UNDERSCORE;
|
|
break;
|
|
|
|
@@ -1541,7 +1541,7 @@ for (;;)
|
|
if (clen > 0)
|
|
{
|
|
BOOL OK;
|
|
- const ucd_record * prop = GET_UCD(c);
|
|
+ const pcre_uint8 chartype = UCD_CHARTYPE(c);
|
|
switch(code[2])
|
|
{
|
|
case PT_ANY:
|
|
@@ -1549,43 +1549,43 @@ for (;;)
|
|
break;
|
|
|
|
case PT_LAMP:
|
|
- OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
|
|
- prop->chartype == ucp_Lt;
|
|
+ OK = chartype == ucp_Lu || chartype == ucp_Ll ||
|
|
+ chartype == ucp_Lt;
|
|
break;
|
|
|
|
case PT_GC:
|
|
- OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
|
|
+ OK = PRIV(ucp_gentype)[chartype] == code[3];
|
|
break;
|
|
|
|
case PT_PC:
|
|
- OK = prop->chartype == code[3];
|
|
+ OK = chartype == code[3];
|
|
break;
|
|
|
|
case PT_SC:
|
|
- OK = prop->script == code[3];
|
|
+ OK = UCD_SCRIPT(c) == code[3];
|
|
break;
|
|
|
|
/* These are specials for combination cases. */
|
|
|
|
case PT_ALNUM:
|
|
- OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
|
|
- PRIV(ucp_gentype)[prop->chartype] == ucp_N;
|
|
+ OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
|
|
+ PRIV(ucp_gentype)[chartype] == ucp_N;
|
|
break;
|
|
|
|
case PT_SPACE: /* Perl space */
|
|
- OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
|
|
+ OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
|
|
c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
|
|
break;
|
|
|
|
case PT_PXSPACE: /* POSIX space */
|
|
- OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
|
|
+ OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
|
|
c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
|
|
c == CHAR_FF || c == CHAR_CR;
|
|
break;
|
|
|
|
case PT_WORD:
|
|
- OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
|
|
- PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
|
|
+ OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
|
|
+ PRIV(ucp_gentype)[chartype] == ucp_N ||
|
|
c == CHAR_UNDERSCORE;
|
|
break;
|
|
|
|
@@ -1813,7 +1813,7 @@ for (;;)
|
|
if (clen > 0)
|
|
{
|
|
BOOL OK;
|
|
- const ucd_record * prop = GET_UCD(c);
|
|
+ const pcre_uint8 chartype = UCD_CHARTYPE(c);
|
|
switch(code[1 + IMM2_SIZE + 1])
|
|
{
|
|
case PT_ANY:
|
|
@@ -1821,43 +1821,43 @@ for (;;)
|
|
break;
|
|
|
|
case PT_LAMP:
|
|
- OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
|
|
- prop->chartype == ucp_Lt;
|
|
+ OK = chartype == ucp_Lu || chartype == ucp_Ll ||
|
|
+ chartype == ucp_Lt;
|
|
break;
|
|
|
|
case PT_GC:
|
|
- OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
|
|
+ OK = PRIV(ucp_gentype)[chartype] == code[1 + IMM2_SIZE + 2];
|
|
break;
|
|
|
|
case PT_PC:
|
|
- OK = prop->chartype == code[1 + IMM2_SIZE + 2];
|
|
+ OK = chartype == code[1 + IMM2_SIZE + 2];
|
|
break;
|
|
|
|
case PT_SC:
|
|
- OK = prop->script == code[1 + IMM2_SIZE + 2];
|
|
+ OK = UCD_SCRIPT(c) == code[1 + IMM2_SIZE + 2];
|
|
break;
|
|
|
|
/* These are specials for combination cases. */
|
|
|
|
case PT_ALNUM:
|
|
- OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
|
|
- PRIV(ucp_gentype)[prop->chartype] == ucp_N;
|
|
+ OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
|
|
+ PRIV(ucp_gentype)[chartype] == ucp_N;
|
|
break;
|
|
|
|
case PT_SPACE: /* Perl space */
|
|
- OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
|
|
+ OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
|
|
c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
|
|
break;
|
|
|
|
case PT_PXSPACE: /* POSIX space */
|
|
- OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
|
|
+ OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
|
|
c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
|
|
c == CHAR_FF || c == CHAR_CR;
|
|
break;
|
|
|
|
case PT_WORD:
|
|
- OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
|
|
- PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
|
|
+ OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
|
|
+ PRIV(ucp_gentype)[chartype] == ucp_N ||
|
|
c == CHAR_UNDERSCORE;
|
|
break;
|
|
|
|
diff --git a/glib/pcre/pcre_exec.c b/glib/pcre/pcre_exec.c
|
|
index 830b8b5..c89a3f9 100644
|
|
--- a/glib/pcre/pcre_exec.c
|
|
+++ b/glib/pcre/pcre_exec.c
|
|
@@ -2565,7 +2565,7 @@ for (;;)
|
|
}
|
|
GETCHARINCTEST(c, eptr);
|
|
{
|
|
- const ucd_record *prop = GET_UCD(c);
|
|
+ const pcre_uint8 chartype = UCD_CHARTYPE(c);
|
|
|
|
switch(ecode[1])
|
|
{
|
|
@@ -2574,44 +2574,44 @@ for (;;)
|
|
break;
|
|
|
|
case PT_LAMP:
|
|
- if ((prop->chartype == ucp_Lu ||
|
|
- prop->chartype == ucp_Ll ||
|
|
- prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
|
|
+ if ((chartype == ucp_Lu ||
|
|
+ chartype == ucp_Ll ||
|
|
+ chartype == ucp_Lt) == (op == OP_NOTPROP))
|
|
RRETURN(MATCH_NOMATCH);
|
|
break;
|
|
|
|
case PT_GC:
|
|
- if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
|
|
+ if ((ecode[2] != PRIV(ucp_gentype)[chartype]) == (op == OP_PROP))
|
|
RRETURN(MATCH_NOMATCH);
|
|
break;
|
|
|
|
case PT_PC:
|
|
- if ((ecode[2] != prop->chartype) == (op == OP_PROP))
|
|
+ if ((ecode[2] != chartype) == (op == OP_PROP))
|
|
RRETURN(MATCH_NOMATCH);
|
|
break;
|
|
|
|
case PT_SC:
|
|
- if ((ecode[2] != prop->script) == (op == OP_PROP))
|
|
+ if ((ecode[2] != UCD_SCRIPT(c)) == (op == OP_PROP))
|
|
RRETURN(MATCH_NOMATCH);
|
|
break;
|
|
|
|
/* These are specials */
|
|
|
|
case PT_ALNUM:
|
|
- if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
|
|
- PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
|
|
+ if ((PRIV(ucp_gentype)[chartype] == ucp_L ||
|
|
+ PRIV(ucp_gentype)[chartype] == ucp_N) == (op == OP_NOTPROP))
|
|
RRETURN(MATCH_NOMATCH);
|
|
break;
|
|
|
|
case PT_SPACE: /* Perl space */
|
|
- if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
|
|
+ if ((PRIV(ucp_gentype)[chartype] == ucp_Z ||
|
|
c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
|
|
== (op == OP_NOTPROP))
|
|
RRETURN(MATCH_NOMATCH);
|
|
break;
|
|
|
|
case PT_PXSPACE: /* POSIX space */
|
|
- if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
|
|
+ if ((PRIV(ucp_gentype)[chartype] == ucp_Z ||
|
|
c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
|
|
c == CHAR_FF || c == CHAR_CR)
|
|
== (op == OP_NOTPROP))
|
|
@@ -2619,8 +2619,8 @@ for (;;)
|
|
break;
|
|
|
|
case PT_WORD:
|
|
- if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
|
|
- PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
|
|
+ if ((PRIV(ucp_gentype)[chartype] == ucp_L ||
|
|
+ PRIV(ucp_gentype)[chartype] == ucp_N ||
|
|
c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
|
|
RRETURN(MATCH_NOMATCH);
|
|
break;
|
|
diff --git a/glib/pcre/pcre_internal.h b/glib/pcre/pcre_internal.h
|
|
index 181c312..234af1b 100644
|
|
--- a/glib/pcre/pcre_internal.h
|
|
+++ b/glib/pcre/pcre_internal.h
|
|
@@ -2329,15 +2329,15 @@ extern const int PRIV(ucp_typerange)[];
#ifdef SUPPORT_UCP
/* UCD access macros */

-#define UCD_BLOCK_SIZE 128
-#define GET_UCD(ch) (PRIV(ucd_records) + \
- PRIV(ucd_stage2)[PRIV(ucd_stage1)[(ch) / UCD_BLOCK_SIZE] * \
- UCD_BLOCK_SIZE + (ch) % UCD_BLOCK_SIZE])
+/* GLib-backed other-case lookup; the definition is added to pcre_tables.c. */
+unsigned int _pcre_ucp_othercase(const unsigned int c);

-#define UCD_CHARTYPE(ch) GET_UCD(ch)->chartype
-#define UCD_SCRIPT(ch) GET_UCD(ch)->script
+/* Character type and script are queried from GLib. Each expansion is wrapped
+   in parentheses so that the cast result is a single operand wherever used. */
+#define UCD_CHARTYPE(ch) ((pcre_uint8)g_unichar_type((gunichar)(ch)))
+#define UCD_SCRIPT(ch) ((pcre_uint8)g_unichar_get_script((gunichar)(ch)))
#define UCD_CATEGORY(ch) PRIV(ucp_gentype)[UCD_CHARTYPE(ch)]
-#define UCD_OTHERCASE(ch) (ch + GET_UCD(ch)->other_case)
+#define UCD_OTHERCASE(ch) (_pcre_ucp_othercase(ch))

#endif /* SUPPORT_UCP */
|
|
|
|
diff --git a/glib/pcre/pcre_tables.c b/glib/pcre/pcre_tables.c
|
|
index 7ac2d89..e401974 100644
|
|
--- a/glib/pcre/pcre_tables.c
|
|
+++ b/glib/pcre/pcre_tables.c
|
|
@@ -584,6 +584,34 @@ const ucp_type_table PRIV(utt)[] = {

const int PRIV(utt_size) = sizeof(PRIV(utt)) / sizeof(ucp_type_table);

+/* Find the other case of a code point using GLib's case functions.
+
+Argument:   c   the code point
+Returns:    the upper-case form of a lower-case c, the lower-case form of an
+            upper-case c, or NOTACHAR when c is neither or maps to itself
+
+NOTE(review): the table-based UCD_OTHERCASE() this replaces expanded to
+(ch + GET_UCD(ch)->other_case); confirm that callers are prepared for NOTACHAR
+when there is no other case. */
+
+unsigned int
+_pcre_ucp_othercase(const unsigned int c)
+{
+  /* Unsigned, like the parameter and the return type, so that the comparison
+     with c below does not mix signed and unsigned operands. */
+  unsigned int other_case = NOTACHAR;
+
+  if (g_unichar_islower(c))
+    other_case = g_unichar_toupper(c);
+  else if (g_unichar_isupper(c))
+    other_case = g_unichar_tolower(c);
+
+  if (other_case == c)
+    other_case = NOTACHAR;
+
+  return other_case;
+}
+
#endif /* SUPPORT_UTF */

/* End of pcre_tables.c */
|
|
diff --git a/glib/pcre/pcre_xclass.c b/glib/pcre/pcre_xclass.c
|
|
index dca7a39..e5a55d7 100644
|
|
--- a/glib/pcre/pcre_xclass.c
|
|
+++ b/glib/pcre/pcre_xclass.c
|
|
@@ -127,7 +127,7 @@ while ((t = *data++) != XCL_END)
|
|
#ifdef SUPPORT_UCP
|
|
else /* XCL_PROP & XCL_NOTPROP */
|
|
{
|
|
- const ucd_record *prop = GET_UCD(c);
|
|
+ const pcre_uint8 chartype = UCD_CHARTYPE(c);
|
|
|
|
switch(*data)
|
|
{
|
|
@@ -136,46 +136,46 @@ while ((t = *data++) != XCL_END)
|
|
break;
|
|
|
|
case PT_LAMP:
|
|
- if ((prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
|
|
- prop->chartype == ucp_Lt) == (t == XCL_PROP)) return !negated;
|
|
+ if ((chartype == ucp_Lu || chartype == ucp_Ll ||
|
|
+ chartype == ucp_Lt) == (t == XCL_PROP)) return !negated;
|
|
break;
|
|
|
|
case PT_GC:
|
|
- if ((data[1] == PRIV(ucp_gentype)[prop->chartype]) == (t == XCL_PROP))
|
|
+ if ((data[1] == PRIV(ucp_gentype)[chartype]) == (t == XCL_PROP))
|
|
return !negated;
|
|
break;
|
|
|
|
case PT_PC:
|
|
- if ((data[1] == prop->chartype) == (t == XCL_PROP)) return !negated;
|
|
+ if ((data[1] == chartype) == (t == XCL_PROP)) return !negated;
|
|
break;
|
|
|
|
case PT_SC:
|
|
- if ((data[1] == prop->script) == (t == XCL_PROP)) return !negated;
|
|
+ if ((data[1] == UCD_SCRIPT(c)) == (t == XCL_PROP)) return !negated;
|
|
break;
|
|
|
|
case PT_ALNUM:
|
|
- if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
|
|
- PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (t == XCL_PROP))
|
|
+ if ((PRIV(ucp_gentype)[chartype] == ucp_L ||
|
|
+ PRIV(ucp_gentype)[chartype] == ucp_N) == (t == XCL_PROP))
|
|
return !negated;
|
|
break;
|
|
|
|
case PT_SPACE: /* Perl space */
|
|
- if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
|
|
+ if ((PRIV(ucp_gentype)[chartype] == ucp_Z ||
|
|
c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
|
|
== (t == XCL_PROP))
|
|
return !negated;
|
|
break;
|
|
|
|
case PT_PXSPACE: /* POSIX space */
|
|
- if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
|
|
+ if ((PRIV(ucp_gentype)[chartype] == ucp_Z ||
|
|
c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
|
|
c == CHAR_FF || c == CHAR_CR) == (t == XCL_PROP))
|
|
return !negated;
|
|
break;
|
|
|
|
case PT_WORD:
|
|
- if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
|
|
- PRIV(ucp_gentype)[prop->chartype] == ucp_N || c == CHAR_UNDERSCORE)
|
|
+ if ((PRIV(ucp_gentype)[chartype] == ucp_L ||
|
|
+ PRIV(ucp_gentype)[chartype] == ucp_N || c == CHAR_UNDERSCORE)
|
|
== (t == XCL_PROP))
|
|
return !negated;
|
|
break;
|
|
diff --git a/glib/pcre/ucp.h b/glib/pcre/ucp.h
|
|
index 59c3bec..53a48c9 100644
|
|
--- a/glib/pcre/ucp.h
|
|
+++ b/glib/pcre/ucp.h
|
|
@@ -10,6 +10,7 @@ the UCD access macros. New values that are added for new releases of Unicode
|
|
should always be at the end of each enum, for backwards compatibility. */
|
|
|
|
/* These are the general character categories. */
|
|
+#include "gunicode.h"
|
|
|
|
enum {
|
|
ucp_C, /* Other */
|
|
@@ -24,148 +25,148 @@ enum {
|
|
/* These are the particular character types. */
|
|
|
|
enum {
|
|
- ucp_Cc, /* Control */
|
|
- ucp_Cf, /* Format */
|
|
- ucp_Cn, /* Unassigned */
|
|
- ucp_Co, /* Private use */
|
|
- ucp_Cs, /* Surrogate */
|
|
- ucp_Ll, /* Lower case letter */
|
|
- ucp_Lm, /* Modifier letter */
|
|
- ucp_Lo, /* Other letter */
|
|
- ucp_Lt, /* Title case letter */
|
|
- ucp_Lu, /* Upper case letter */
|
|
- ucp_Mc, /* Spacing mark */
|
|
- ucp_Me, /* Enclosing mark */
|
|
- ucp_Mn, /* Non-spacing mark */
|
|
- ucp_Nd, /* Decimal number */
|
|
- ucp_Nl, /* Letter number */
|
|
- ucp_No, /* Other number */
|
|
- ucp_Pc, /* Connector punctuation */
|
|
- ucp_Pd, /* Dash punctuation */
|
|
- ucp_Pe, /* Close punctuation */
|
|
- ucp_Pf, /* Final punctuation */
|
|
- ucp_Pi, /* Initial punctuation */
|
|
- ucp_Po, /* Other punctuation */
|
|
- ucp_Ps, /* Open punctuation */
|
|
- ucp_Sc, /* Currency symbol */
|
|
- ucp_Sk, /* Modifier symbol */
|
|
- ucp_Sm, /* Mathematical symbol */
|
|
- ucp_So, /* Other symbol */
|
|
- ucp_Zl, /* Line separator */
|
|
- ucp_Zp, /* Paragraph separator */
|
|
- ucp_Zs /* Space separator */
|
|
+ ucp_Cc = G_UNICODE_CONTROL, /* Control */
|
|
+ ucp_Cf = G_UNICODE_FORMAT, /* Format */
|
|
+ ucp_Cn = G_UNICODE_UNASSIGNED, /* Unassigned */
|
|
+ ucp_Co = G_UNICODE_PRIVATE_USE, /* Private use */
|
|
+ ucp_Cs = G_UNICODE_SURROGATE, /* Surrogate */
|
|
+ ucp_Ll = G_UNICODE_LOWERCASE_LETTER, /* Lower case letter */
|
|
+ ucp_Lm = G_UNICODE_MODIFIER_LETTER, /* Modifier letter */
|
|
+ ucp_Lo = G_UNICODE_OTHER_LETTER, /* Other letter */
|
|
+ ucp_Lt = G_UNICODE_TITLECASE_LETTER, /* Title case letter */
|
|
+ ucp_Lu = G_UNICODE_UPPERCASE_LETTER, /* Upper case letter */
|
|
+ ucp_Mc = G_UNICODE_SPACING_MARK, /* Spacing mark */
|
|
+ ucp_Me = G_UNICODE_ENCLOSING_MARK, /* Enclosing mark */
|
|
+ ucp_Mn = G_UNICODE_NON_SPACING_MARK, /* Non-spacing mark */
|
|
+ ucp_Nd = G_UNICODE_DECIMAL_NUMBER, /* Decimal number */
|
|
+ ucp_Nl = G_UNICODE_LETTER_NUMBER, /* Letter number */
|
|
+ ucp_No = G_UNICODE_OTHER_NUMBER, /* Other number */
|
|
+ ucp_Pc = G_UNICODE_CONNECT_PUNCTUATION, /* Connector punctuation */
|
|
+ ucp_Pd = G_UNICODE_DASH_PUNCTUATION, /* Dash punctuation */
|
|
+ ucp_Pe = G_UNICODE_CLOSE_PUNCTUATION, /* Close punctuation */
|
|
+ ucp_Pf = G_UNICODE_FINAL_PUNCTUATION, /* Final punctuation */
|
|
+ ucp_Pi = G_UNICODE_INITIAL_PUNCTUATION, /* Initial punctuation */
|
|
+ ucp_Po = G_UNICODE_OTHER_PUNCTUATION, /* Other punctuation */
|
|
+ ucp_Ps = G_UNICODE_OPEN_PUNCTUATION, /* Open punctuation */
|
|
+ ucp_Sc = G_UNICODE_CURRENCY_SYMBOL, /* Currency symbol */
|
|
+ ucp_Sk = G_UNICODE_MODIFIER_SYMBOL, /* Modifier symbol */
|
|
+ ucp_Sm = G_UNICODE_MATH_SYMBOL, /* Mathematical symbol */
|
|
+ ucp_So = G_UNICODE_OTHER_SYMBOL, /* Other symbol */
|
|
+ ucp_Zl = G_UNICODE_LINE_SEPARATOR, /* Line separator */
|
|
+ ucp_Zp = G_UNICODE_PARAGRAPH_SEPARATOR, /* Paragraph separator */
|
|
+ ucp_Zs = G_UNICODE_SPACE_SEPARATOR /* Space separator */
|
|
};
|
|
|
|
/* These are the script identifications. */
|
|
|
|
enum {
- ucp_Arabic,
- ucp_Armenian,
- ucp_Bengali,
- ucp_Bopomofo,
- ucp_Braille,
- ucp_Buginese,
- ucp_Buhid,
- ucp_Canadian_Aboriginal,
- ucp_Cherokee,
- ucp_Common,
- ucp_Coptic,
- ucp_Cypriot,
- ucp_Cyrillic,
- ucp_Deseret,
- ucp_Devanagari,
- ucp_Ethiopic,
- ucp_Georgian,
- ucp_Glagolitic,
- ucp_Gothic,
- ucp_Greek,
- ucp_Gujarati,
- ucp_Gurmukhi,
- ucp_Han,
- ucp_Hangul,
- ucp_Hanunoo,
- ucp_Hebrew,
- ucp_Hiragana,
- ucp_Inherited,
- ucp_Kannada,
- ucp_Katakana,
- ucp_Kharoshthi,
- ucp_Khmer,
- ucp_Lao,
- ucp_Latin,
- ucp_Limbu,
- ucp_Linear_B,
- ucp_Malayalam,
- ucp_Mongolian,
- ucp_Myanmar,
- ucp_New_Tai_Lue,
- ucp_Ogham,
- ucp_Old_Italic,
- ucp_Old_Persian,
- ucp_Oriya,
- ucp_Osmanya,
- ucp_Runic,
- ucp_Shavian,
- ucp_Sinhala,
- ucp_Syloti_Nagri,
- ucp_Syriac,
- ucp_Tagalog,
- ucp_Tagbanwa,
- ucp_Tai_Le,
- ucp_Tamil,
- ucp_Telugu,
- ucp_Thaana,
- ucp_Thai,
- ucp_Tibetan,
- ucp_Tifinagh,
- ucp_Ugaritic,
- ucp_Yi,
+ ucp_Arabic = G_UNICODE_SCRIPT_ARABIC,
+ ucp_Armenian = G_UNICODE_SCRIPT_ARMENIAN,
+ ucp_Bengali = G_UNICODE_SCRIPT_BENGALI,
+ ucp_Bopomofo = G_UNICODE_SCRIPT_BOPOMOFO,
+ ucp_Braille = G_UNICODE_SCRIPT_BRAILLE,
+ ucp_Buginese = G_UNICODE_SCRIPT_BUGINESE,
+ ucp_Buhid = G_UNICODE_SCRIPT_BUHID,
+ ucp_Canadian_Aboriginal = G_UNICODE_SCRIPT_CANADIAN_ABORIGINAL,
+ ucp_Cherokee = G_UNICODE_SCRIPT_CHEROKEE,
+ ucp_Common = G_UNICODE_SCRIPT_COMMON,
+ ucp_Coptic = G_UNICODE_SCRIPT_COPTIC,
+ ucp_Cypriot = G_UNICODE_SCRIPT_CYPRIOT,
+ ucp_Cyrillic = G_UNICODE_SCRIPT_CYRILLIC,
+ ucp_Deseret = G_UNICODE_SCRIPT_DESERET,
+ ucp_Devanagari = G_UNICODE_SCRIPT_DEVANAGARI,
+ ucp_Ethiopic = G_UNICODE_SCRIPT_ETHIOPIC,
+ ucp_Georgian = G_UNICODE_SCRIPT_GEORGIAN,
+ ucp_Glagolitic = G_UNICODE_SCRIPT_GLAGOLITIC,
+ ucp_Gothic = G_UNICODE_SCRIPT_GOTHIC,
+ ucp_Greek = G_UNICODE_SCRIPT_GREEK,
+ ucp_Gujarati = G_UNICODE_SCRIPT_GUJARATI,
+ ucp_Gurmukhi = G_UNICODE_SCRIPT_GURMUKHI,
+ ucp_Han = G_UNICODE_SCRIPT_HAN,
+ ucp_Hangul = G_UNICODE_SCRIPT_HANGUL,
+ ucp_Hanunoo = G_UNICODE_SCRIPT_HANUNOO,
+ ucp_Hebrew = G_UNICODE_SCRIPT_HEBREW,
+ ucp_Hiragana = G_UNICODE_SCRIPT_HIRAGANA,
+ ucp_Inherited = G_UNICODE_SCRIPT_INHERITED,
+ ucp_Kannada = G_UNICODE_SCRIPT_KANNADA,
+ ucp_Katakana = G_UNICODE_SCRIPT_KATAKANA,
+ ucp_Kharoshthi = G_UNICODE_SCRIPT_KHAROSHTHI,
+ ucp_Khmer = G_UNICODE_SCRIPT_KHMER,
+ ucp_Lao = G_UNICODE_SCRIPT_LAO,
+ ucp_Latin = G_UNICODE_SCRIPT_LATIN,
+ ucp_Limbu = G_UNICODE_SCRIPT_LIMBU,
+ ucp_Linear_B = G_UNICODE_SCRIPT_LINEAR_B,
+ ucp_Malayalam = G_UNICODE_SCRIPT_MALAYALAM,
+ ucp_Mongolian = G_UNICODE_SCRIPT_MONGOLIAN,
+ ucp_Myanmar = G_UNICODE_SCRIPT_MYANMAR,
+ ucp_New_Tai_Lue = G_UNICODE_SCRIPT_NEW_TAI_LUE,
+ ucp_Ogham = G_UNICODE_SCRIPT_OGHAM,
+ ucp_Old_Italic = G_UNICODE_SCRIPT_OLD_ITALIC,
+ ucp_Old_Persian = G_UNICODE_SCRIPT_OLD_PERSIAN,
+ ucp_Oriya = G_UNICODE_SCRIPT_ORIYA,
+ ucp_Osmanya = G_UNICODE_SCRIPT_OSMANYA,
+ ucp_Runic = G_UNICODE_SCRIPT_RUNIC,
+ ucp_Shavian = G_UNICODE_SCRIPT_SHAVIAN,
+ ucp_Sinhala = G_UNICODE_SCRIPT_SINHALA,
+ ucp_Syloti_Nagri = G_UNICODE_SCRIPT_SYLOTI_NAGRI,
+ ucp_Syriac = G_UNICODE_SCRIPT_SYRIAC,
+ ucp_Tagalog = G_UNICODE_SCRIPT_TAGALOG,
+ ucp_Tagbanwa = G_UNICODE_SCRIPT_TAGBANWA,
+ ucp_Tai_Le = G_UNICODE_SCRIPT_TAI_LE,
+ ucp_Tamil = G_UNICODE_SCRIPT_TAMIL,
+ ucp_Telugu = G_UNICODE_SCRIPT_TELUGU,
+ ucp_Thaana = G_UNICODE_SCRIPT_THAANA,
+ ucp_Thai = G_UNICODE_SCRIPT_THAI,
+ ucp_Tibetan = G_UNICODE_SCRIPT_TIBETAN,
+ ucp_Tifinagh = G_UNICODE_SCRIPT_TIFINAGH,
+ ucp_Ugaritic = G_UNICODE_SCRIPT_UGARITIC,
+ ucp_Yi = G_UNICODE_SCRIPT_YI,
/* New for Unicode 5.0: */
- ucp_Balinese,
- ucp_Cuneiform,
- ucp_Nko,
- ucp_Phags_Pa,
- ucp_Phoenician,
+ ucp_Balinese = G_UNICODE_SCRIPT_BALINESE,
+ ucp_Cuneiform = G_UNICODE_SCRIPT_CUNEIFORM,
+ ucp_Nko = G_UNICODE_SCRIPT_NKO,
+ ucp_Phags_Pa = G_UNICODE_SCRIPT_PHAGS_PA,
+ ucp_Phoenician = G_UNICODE_SCRIPT_PHOENICIAN,
/* New for Unicode 5.1: */
- ucp_Carian,
- ucp_Cham,
- ucp_Kayah_Li,
- ucp_Lepcha,
- ucp_Lycian,
- ucp_Lydian,
- ucp_Ol_Chiki,
- ucp_Rejang,
- ucp_Saurashtra,
- ucp_Sundanese,
- ucp_Vai,
+ ucp_Carian = G_UNICODE_SCRIPT_CARIAN,
+ ucp_Cham = G_UNICODE_SCRIPT_CHAM,
+ ucp_Kayah_Li = G_UNICODE_SCRIPT_KAYAH_LI,
+ ucp_Lepcha = G_UNICODE_SCRIPT_LEPCHA,
+ ucp_Lycian = G_UNICODE_SCRIPT_LYCIAN,
+ ucp_Lydian = G_UNICODE_SCRIPT_LYDIAN,
+ ucp_Ol_Chiki = G_UNICODE_SCRIPT_OL_CHIKI,
+ ucp_Rejang = G_UNICODE_SCRIPT_REJANG,
+ ucp_Saurashtra = G_UNICODE_SCRIPT_SAURASHTRA,
+ ucp_Sundanese = G_UNICODE_SCRIPT_SUNDANESE,
+ ucp_Vai = G_UNICODE_SCRIPT_VAI,
/* New for Unicode 5.2: */
- ucp_Avestan,
- ucp_Bamum,
- ucp_Egyptian_Hieroglyphs,
- ucp_Imperial_Aramaic,
- ucp_Inscriptional_Pahlavi,
- ucp_Inscriptional_Parthian,
- ucp_Javanese,
- ucp_Kaithi,
- ucp_Lisu,
- ucp_Meetei_Mayek,
- ucp_Old_South_Arabian,
- ucp_Old_Turkic,
- ucp_Samaritan,
- ucp_Tai_Tham,
- ucp_Tai_Viet,
+ ucp_Avestan = G_UNICODE_SCRIPT_AVESTAN,
+ ucp_Bamum = G_UNICODE_SCRIPT_BAMUM,
+ ucp_Egyptian_Hieroglyphs = G_UNICODE_SCRIPT_EGYPTIAN_HIEROGLYPHS,
+ ucp_Imperial_Aramaic = G_UNICODE_SCRIPT_IMPERIAL_ARAMAIC,
+ ucp_Inscriptional_Pahlavi = G_UNICODE_SCRIPT_INSCRIPTIONAL_PAHLAVI,
+ ucp_Inscriptional_Parthian = G_UNICODE_SCRIPT_INSCRIPTIONAL_PARTHIAN,
+ ucp_Javanese = G_UNICODE_SCRIPT_JAVANESE,
+ ucp_Kaithi = G_UNICODE_SCRIPT_KAITHI,
+ ucp_Lisu = G_UNICODE_SCRIPT_LISU,
+ ucp_Meetei_Mayek = G_UNICODE_SCRIPT_MEETEI_MAYEK,
+ ucp_Old_South_Arabian = G_UNICODE_SCRIPT_OLD_SOUTH_ARABIAN,
+ ucp_Old_Turkic = G_UNICODE_SCRIPT_OLD_TURKIC,
+ ucp_Samaritan = G_UNICODE_SCRIPT_SAMARITAN,
+ ucp_Tai_Tham = G_UNICODE_SCRIPT_TAI_THAM,
+ ucp_Tai_Viet = G_UNICODE_SCRIPT_TAI_VIET,
/* New for Unicode 6.0.0: */
- ucp_Batak,
- ucp_Brahmi,
- ucp_Mandaic,
+ ucp_Batak = G_UNICODE_SCRIPT_BATAK,
+ ucp_Brahmi = G_UNICODE_SCRIPT_BRAHMI,
+ ucp_Mandaic = G_UNICODE_SCRIPT_MANDAIC,
/* New for Unicode 6.1.0: */
- ucp_Chakma,
- ucp_Meroitic_Cursive,
- ucp_Meroitic_Hieroglyphs,
- ucp_Miao,
- ucp_Sharada,
- ucp_Sora_Sompeng,
- ucp_Takri
+ ucp_Chakma = G_UNICODE_SCRIPT_CHAKMA,
+ ucp_Meroitic_Cursive = G_UNICODE_SCRIPT_MEROITIC_CURSIVE,
+ ucp_Meroitic_Hieroglyphs = G_UNICODE_SCRIPT_MEROITIC_HIEROGLYPHS,
+ ucp_Miao = G_UNICODE_SCRIPT_MIAO,
+ ucp_Sharada = G_UNICODE_SCRIPT_SHARADA,
+ ucp_Sora_Sompeng = G_UNICODE_SCRIPT_SORA_SOMPENG,
+ ucp_Takri = G_UNICODE_SCRIPT_TAKRI
};
|
|
|
|
#endif
|
|
--
|
|
1.7.5.1.217.g4e3aa.dirty
|
|
|