6666 lines
329 KiB
Diff
6666 lines
329 KiB
Diff
Backported from 5.5 for 5.4 by Remi Collet
|
|
|
|
|
|
diff -ru php-5.4.45/ext/pcre/pcrelib/config.h php55/php-5.5.31/ext/pcre/pcrelib/config.h
|
|
--- php-5.4.45/ext/pcre/pcrelib/config.h 2015-09-01 22:09:37.000000000 +0200
|
|
+++ php-5.5.31/ext/pcre/pcrelib/config.h 2016-01-06 10:36:49.000000000 +0100
|
|
@@ -302,6 +302,8 @@
|
|
*/
|
|
/* #undef NO_RECURSE */
|
|
|
|
+#define PARENS_NEST_LIMIT 250
|
|
+
|
|
/* Name of package */
|
|
#define PACKAGE "pcre"
|
|
|
|
diff -ru php54/php-5.4.45/ext/pcre/pcrelib/pcre_exec.c php55/php-5.5.31/ext/pcre/pcrelib/pcre_exec.c
|
|
--- php-5.4.45/ext/pcre/pcrelib/pcre_exec.c 2015-09-01 22:09:37.000000000 +0200
|
|
+++ php-5.5.31/ext/pcre/pcrelib/pcre_exec.c 2016-01-06 10:36:49.000000000 +0100
|
|
@@ -688,7 +688,7 @@
|
|
#define foc number
|
|
#define save_mark data
|
|
|
|
-/* These statements are here to stop the compiler complaining about unitialized
|
|
+/* These statements are here to stop the compiler complaining about uninitialized
|
|
variables. */
|
|
|
|
#ifdef SUPPORT_UCP
|
|
|
|
From ca02d9c2d6f9bea7bf8abe607f1ee9484b1d7b62 Mon Sep 17 00:00:00 2001
|
|
From: Stanislav Malyshev <stas@php.net>
|
|
Date: Sun, 31 Jan 2016 20:33:17 -0800
|
|
Subject: [PATCH] Upgrade bundled PCRE to 8.38
|
|
|
|
---
|
|
NEWS | 3 +
|
|
ext/pcre/pcrelib/ChangeLog | 176 ++
|
|
ext/pcre/pcrelib/NEWS | 8 +
|
|
ext/pcre/pcrelib/config.h | 11 +-
|
|
ext/pcre/pcrelib/doc/pcre.txt | 2130 +++++++++++-----------
|
|
ext/pcre/pcrelib/pcre.h | 4 +-
|
|
ext/pcre/pcrelib/pcre_compile.c | 334 +++-
|
|
ext/pcre/pcrelib/pcre_exec.c | 5 +-
|
|
ext/pcre/pcrelib/pcre_internal.h | 17 +-
|
|
ext/pcre/pcrelib/pcre_jit_compile.c | 77 +-
|
|
ext/pcre/pcrelib/pcre_study.c | 19 +-
|
|
ext/pcre/pcrelib/pcre_xclass.c | 2 +-
|
|
ext/pcre/pcrelib/sljit/sljitConfig.h | 9 +
|
|
ext/pcre/pcrelib/sljit/sljitConfigInternal.h | 13 +-
|
|
ext/pcre/pcrelib/sljit/sljitLir.c | 10 +-
|
|
ext/pcre/pcrelib/sljit/sljitLir.h | 128 +-
|
|
ext/pcre/pcrelib/sljit/sljitNativeARM_32.c | 27 +-
|
|
ext/pcre/pcrelib/sljit/sljitNativeARM_64.c | 48 +-
|
|
ext/pcre/pcrelib/sljit/sljitNativeARM_T2_32.c | 58 +-
|
|
ext/pcre/pcrelib/sljit/sljitNativeMIPS_common.c | 15 +-
|
|
ext/pcre/pcrelib/sljit/sljitNativePPC_common.c | 23 +-
|
|
ext/pcre/pcrelib/sljit/sljitNativeSPARC_common.c | 19 +-
|
|
ext/pcre/pcrelib/sljit/sljitNativeTILEGX_64.c | 311 ++--
|
|
ext/pcre/pcrelib/sljit/sljitNativeX86_common.c | 129 +-
|
|
ext/pcre/pcrelib/testdata/grepoutput | 12 +
|
|
ext/pcre/pcrelib/testdata/testinput1 | 13 +
|
|
ext/pcre/pcrelib/testdata/testinput11 | 4 +
|
|
ext/pcre/pcrelib/testdata/testinput12 | 17 +
|
|
ext/pcre/pcrelib/testdata/testinput14 | 2 +
|
|
ext/pcre/pcrelib/testdata/testinput17 | 2 +
|
|
ext/pcre/pcrelib/testdata/testinput2 | 139 ++
|
|
ext/pcre/pcrelib/testdata/testinput4 | 5 +
|
|
ext/pcre/pcrelib/testdata/testinput5 | 8 +
|
|
ext/pcre/pcrelib/testdata/testinput6 | 57 +
|
|
ext/pcre/pcrelib/testdata/testinput7 | 15 +
|
|
ext/pcre/pcrelib/testdata/testinput8 | 4 +
|
|
ext/pcre/pcrelib/testdata/testinputEBC | 3 +
|
|
ext/pcre/pcrelib/testdata/testoutput1 | 23 +
|
|
ext/pcre/pcrelib/testdata/testoutput11-16 | 50 +-
|
|
ext/pcre/pcrelib/testdata/testoutput11-32 | 50 +-
|
|
ext/pcre/pcrelib/testdata/testoutput11-8 | 50 +-
|
|
ext/pcre/pcrelib/testdata/testoutput12 | 25 +
|
|
ext/pcre/pcrelib/testdata/testoutput14 | 2 +
|
|
ext/pcre/pcrelib/testdata/testoutput17 | 2 +
|
|
ext/pcre/pcrelib/testdata/testoutput2 | 380 +++-
|
|
ext/pcre/pcrelib/testdata/testoutput4 | 6 +
|
|
ext/pcre/pcrelib/testdata/testoutput5 | 45 +
|
|
ext/pcre/pcrelib/testdata/testoutput6 | 96 +
|
|
ext/pcre/pcrelib/testdata/testoutput7 | 57 +-
|
|
ext/pcre/pcrelib/testdata/testoutput8 | 6 +
|
|
ext/pcre/pcrelib/testdata/testoutputEBC | 6 +
|
|
51 files changed, 3144 insertions(+), 1511 deletions(-)
|
|
|
|
diff --git a/ext/pcre/pcrelib/ChangeLog b/ext/pcre/pcrelib/ChangeLog
|
|
index 359b412..5e5bf18 100644
|
|
--- a/ext/pcre/pcrelib/ChangeLog
|
|
+++ b/ext/pcre/pcrelib/ChangeLog
|
|
@@ -1,6 +1,182 @@
|
|
ChangeLog for PCRE
|
|
------------------
|
|
|
|
+Note that the PCRE 8.xx series (PCRE1) is now in a bugfix-only state. All
|
|
+development is happening in the PCRE2 10.xx series.
|
|
+
|
|
+Version 8.38 23-November-2015
|
|
+-----------------------------
|
|
+
|
|
+1. If a group that contained a recursive back reference also contained a
|
|
+ forward reference subroutine call followed by a non-forward-reference
|
|
+ subroutine call, for example /.((?2)(?R)\1)()/, pcre_compile() failed to
|
|
+ compile correct code, leading to undefined behaviour or an internally
|
|
+ detected error. This bug was discovered by the LLVM fuzzer.
|
|
+
|
|
+2. Quantification of certain items (e.g. atomic back references) could cause
|
|
+ incorrect code to be compiled when recursive forward references were
|
|
+ involved. For example, in this pattern: /(?1)()((((((\1++))\x85)+)|))/.
|
|
+ This bug was discovered by the LLVM fuzzer.
|
|
+
|
|
+3. A repeated conditional group whose condition was a reference by name caused
|
|
+ a buffer overflow if there was more than one group with the given name.
|
|
+ This bug was discovered by the LLVM fuzzer.
|
|
+
|
|
+4. A recursive back reference by name within a group that had the same name as
|
|
+ another group caused a buffer overflow. For example:
|
|
+ /(?J)(?'d'(?'d'\g{d}))/. This bug was discovered by the LLVM fuzzer.
|
|
+
|
|
+5. A forward reference by name to a group whose number is the same as the
|
|
+ current group, for example in this pattern: /(?|(\k'Pm')|(?'Pm'))/, caused
|
|
+ a buffer overflow at compile time. This bug was discovered by the LLVM
|
|
+ fuzzer.
|
|
+
|
|
+6. A lookbehind assertion within a set of mutually recursive subpatterns could
|
|
+ provoke a buffer overflow. This bug was discovered by the LLVM fuzzer.
|
|
+
|
|
+7. Another buffer overflow bug involved duplicate named groups with a
|
|
+ reference between their definition, with a group that reset capture
|
|
+ numbers, for example: /(?J:(?|(?'R')(\k'R')|((?'R'))))/. This has been
|
|
+ fixed by always allowing for more memory, even if not needed. (A proper fix
|
|
+ is implemented in PCRE2, but it involves more refactoring.)
|
|
+
|
|
+8. There was no check for integer overflow in subroutine calls such as (?123).
|
|
+
|
|
+9. The table entry for \l in EBCDIC environments was incorrect, leading to its
|
|
+ being treated as a literal 'l' instead of causing an error.
|
|
+
|
|
+10. There was a buffer overflow if pcre_exec() was called with an ovector of
|
|
+ size 1. This bug was found by american fuzzy lop.
|
|
+
|
|
+11. If a non-capturing group containing a conditional group that could match
|
|
+ an empty string was repeated, it was not identified as matching an empty
|
|
+ string itself. For example: /^(?:(?(1)x|)+)+$()/.
|
|
+
|
|
+12. In an EBCDIC environment, pcretest was mishandling the escape sequences
|
|
+ \a and \e in test subject lines.
|
|
+
|
|
+13. In an EBCDIC environment, \a in a pattern was converted to the ASCII
|
|
+ instead of the EBCDIC value.
|
|
+
|
|
+14. The handling of \c in an EBCDIC environment has been revised so that it is
|
|
+ now compatible with the specification in Perl's perlebcdic page.
|
|
+
|
|
+15. The EBCDIC character 0x41 is a non-breaking space, equivalent to 0xa0 in
|
|
+ ASCII/Unicode. This has now been added to the list of characters that are
|
|
+ recognized as white space in EBCDIC.
|
|
+
|
|
+16. When PCRE was compiled without UCP support, the use of \p and \P gave an
|
|
+ error (correctly) when used outside a class, but did not give an error
|
|
+ within a class.
|
|
+
|
|
+17. \h within a class was incorrectly compiled in EBCDIC environments.
|
|
+
|
|
+18. A pattern with an unmatched closing parenthesis that contained a backward
|
|
+ assertion which itself contained a forward reference caused buffer
|
|
+ overflow. An example pattern is: /(?=di(?<=(?1))|(?=(.))))/.
|
|
+
|
|
+19. JIT should return with error when the compiled pattern requires more stack
|
|
+ space than the maximum.
|
|
+
|
|
+20. A possessively repeated conditional group that could match an empty string,
|
|
+ for example, /(?(R))*+/, was incorrectly compiled.
|
|
+
|
|
+21. Fix infinite recursion in the JIT compiler when certain patterns such as
|
|
+ /(?:|a|){100}x/ are analysed.
|
|
+
|
|
+22. Some patterns with character classes involving [: and \\ were incorrectly
|
|
+ compiled and could cause reading from uninitialized memory or an incorrect
|
|
+ error diagnosis.
|
|
+
|
|
+23. Pathological patterns containing many nested occurrences of [: caused
|
|
+ pcre_compile() to run for a very long time.
|
|
+
|
|
+24. A conditional group with only one branch has an implicit empty alternative
|
|
+ branch and must therefore be treated as potentially matching an empty
|
|
+ string.
|
|
+
|
|
+25. If (?R was followed by - or + incorrect behaviour happened instead of a
|
|
+ diagnostic.
|
|
+
|
|
+26. Arrange to give up on finding the minimum matching length for overly
|
|
+ complex patterns.
|
|
+
|
|
+27. Similar to (4) above: in a pattern with duplicated named groups and an
|
|
+ occurrence of (?| it is possible for an apparently non-recursive back
|
|
+ reference to become recursive if a later named group with the relevant
|
|
+ number is encountered. This could lead to a buffer overflow. Wen Guanxing
|
|
+ from Venustech ADLAB discovered this bug.
|
|
+
|
|
+28. If pcregrep was given the -q option with -c or -l, or when handling a
|
|
+ binary file, it incorrectly wrote output to stdout.
|
|
+
|
|
+29. The JIT compiler did not restore the control verb head in case of *THEN
|
|
+ control verbs. This issue was found by Karl Skomski with a custom LLVM
|
|
+ fuzzer.
|
|
+
|
|
+30. Error messages for syntax errors following \g and \k were giving inaccurate
|
|
+ offsets in the pattern.
|
|
+
|
|
+31. Added a check for integer overflow in conditions (?(<digits>) and
|
|
+ (?(R<digits>). This omission was discovered by Karl Skomski with the LLVM
|
|
+ fuzzer.
|
|
+
|
|
+32. Handling recursive references such as (?2) when the reference is to a group
|
|
+ later in the pattern uses code that is very hacked about and error-prone.
|
|
+ It has been re-written for PCRE2. Here in PCRE1, a check has been added to
|
|
+ give an internal error if it is obvious that compiling has gone wrong.
|
|
+
|
|
+33. The JIT compiler should not check repeats after a {0,1} repeat byte code.
|
|
+ This issue was found by Karl Skomski with a custom LLVM fuzzer.
|
|
+
|
|
+34. The JIT compiler should restore the control chain for empty possessive
|
|
+ repeats. This issue was found by Karl Skomski with a custom LLVM fuzzer.
|
|
+
|
|
+35. Match limit check added to JIT recursion. This issue was found by Karl
|
|
+ Skomski with a custom LLVM fuzzer.
|
|
+
|
|
+36. Yet another case similar to 27 above has been circumvented by an
|
|
+ unconditional allocation of extra memory. This issue is fixed "properly" in
|
|
+ PCRE2 by refactoring the way references are handled. Wen Guanxing
|
|
+ from Venustech ADLAB discovered this bug.
|
|
+
|
|
+37. Fix two assertion fails in JIT. These issues were found by Karl Skomski
|
|
+ with a custom LLVM fuzzer.
|
|
+
|
|
+38. Fixed a corner case of range optimization in JIT.
|
|
+
|
|
+39. An incorrect error "overran compiling workspace" was given if there were
|
|
+ exactly enough group forward references such that the last one extended
|
|
+ into the workspace safety margin. The next one would have expanded the
|
|
+ workspace. The test for overflow was not including the safety margin.
|
|
+
|
|
+40. A match limit issue is fixed in JIT which was found by Karl Skomski
|
|
+ with a custom LLVM fuzzer.
|
|
+
|
|
+41. Remove the use of /dev/null in testdata/testinput2, because it doesn't
|
|
+ work under Windows. (Why has it taken so long for anyone to notice?)
|
|
+
|
|
+42. In a character class such as [\W\p{Any}] where both a negative-type escape
|
|
+ ("not a word character") and a property escape were present, the property
|
|
+ escape was being ignored.
|
|
+
|
|
+43. Fix crash caused by very long (*MARK) or (*THEN) names.
|
|
+
|
|
+44. A sequence such as [[:punct:]b] that is, a POSIX character class followed
|
|
+ by a single ASCII character in a class item, was incorrectly compiled in
|
|
+ UCP mode. The POSIX class got lost, but only if the single character
|
|
+ followed it.
|
|
+
|
|
+45. [:punct:] in UCP mode was matching some characters in the range 128-255
|
|
+ that should not have been matched.
|
|
+
|
|
+46. If [:^ascii:] or [:^xdigit:] or [:^cntrl:] are present in a non-negated
|
|
+ class, all characters with code points greater than 255 are in the class.
|
|
+ When a Unicode property was also in the class (if PCRE_UCP is set, escapes
|
|
+ such as \w are turned into Unicode properties), wide characters were not
|
|
+ correctly handled, and could fail to match.
|
|
+
|
|
+
|
|
Version 8.37 28-April-2015
|
|
--------------------------
|
|
|
|
diff --git a/ext/pcre/pcrelib/NEWS b/ext/pcre/pcrelib/NEWS
|
|
index 064bf27..7e42dcb 100644
|
|
--- a/ext/pcre/pcrelib/NEWS
|
|
+++ b/ext/pcre/pcrelib/NEWS
|
|
@@ -1,6 +1,14 @@
|
|
News about PCRE releases
|
|
------------------------
|
|
|
|
+Release 8.38 23-November-2015
|
|
+-----------------------------
|
|
+
|
|
+This is a bug-fix release. Note that this library (now called PCRE1) is now being
|
|
+maintained for bug fixes only. New projects are advised to use the new PCRE2
|
|
+libraries.
|
|
+
|
|
+
|
|
Release 8.37 28-April-2015
|
|
--------------------------
|
|
|
|
diff --git a/ext/pcre/pcrelib/config.h b/ext/pcre/pcrelib/config.h
|
|
index ba06a17..0f7a9f7 100644
|
|
--- a/ext/pcre/pcrelib/config.h
|
|
+++ b/ext/pcre/pcrelib/config.h
|
|
@@ -234,8 +234,8 @@ them both to 0; an emulation function will be used. */
|
|
#define LINK_SIZE 2
|
|
#endif
|
|
|
|
-/* Define to the sub-directory in which libtool stores uninstalled libraries.
|
|
- */
|
|
+/* Define to the sub-directory where libtool stores uninstalled libraries. */
|
|
+/* This is ignored unless you are using libtool. */
|
|
#ifndef LT_OBJDIR
|
|
#define LT_OBJDIR ".libs/"
|
|
#endif
|
|
@@ -314,7 +314,7 @@ them both to 0; an emulation function will be used. */
|
|
#define PACKAGE_NAME "PCRE"
|
|
|
|
/* Define to the full name and version of this package. */
|
|
-#define PACKAGE_STRING "PCRE 8.37"
|
|
+#define PACKAGE_STRING "PCRE 8.38"
|
|
|
|
/* Define to the one symbol short name of this package. */
|
|
#define PACKAGE_TARNAME "pcre"
|
|
@@ -323,7 +323,7 @@ them both to 0; an emulation function will be used. */
|
|
#define PACKAGE_URL ""
|
|
|
|
/* Define to the version of this package. */
|
|
-#define PACKAGE_VERSION "8.37"
|
|
+#define PACKAGE_VERSION "8.38"
|
|
|
|
/* to make a symbol visible */
|
|
/* #undef PCRECPP_EXP_DECL */
|
|
@@ -439,7 +439,7 @@ them both to 0; an emulation function will be used. */
|
|
|
|
/* Version number of package */
|
|
#ifndef VERSION
|
|
-#define VERSION "8.37"
|
|
+#define VERSION "8.38"
|
|
#endif
|
|
|
|
/* Define to empty if `const' does not conform to ANSI C. */
|
|
@@ -451,4 +451,3 @@ them both to 0; an emulation function will be used. */
|
|
|
|
/* Define to `unsigned int' if <sys/types.h> does not define. */
|
|
/* #undef size_t */
|
|
-
|
|
diff --git a/ext/pcre/pcrelib/doc/pcre.txt b/ext/pcre/pcrelib/doc/pcre.txt
|
|
index ce27f4b..76a47c7 100644
|
|
--- a/ext/pcre/pcrelib/doc/pcre.txt
|
|
+++ b/ext/pcre/pcrelib/doc/pcre.txt
|
|
@@ -13,7 +13,18 @@ PCRE(3) Library Functions Manual PCRE(3)
|
|
|
|
|
|
NAME
|
|
- PCRE - Perl-compatible regular expressions
|
|
+ PCRE - Perl-compatible regular expressions (original API)
|
|
+
|
|
+PLEASE TAKE NOTE
|
|
+
|
|
+ This document relates to PCRE releases that use the original API, with
|
|
+ library names libpcre, libpcre16, and libpcre32. January 2015 saw the
|
|
+ first release of a new API, known as PCRE2, with release numbers start-
|
|
+ ing at 10.00 and library names libpcre2-8, libpcre2-16, and
|
|
+ libpcre2-32. The old libraries (now called PCRE1) are still being main-
|
|
+ tained for bug fixes, but there will be no new development. New
|
|
+ projects are advised to use the new PCRE2 libraries.
|
|
+
|
|
|
|
INTRODUCTION
|
|
|
|
@@ -179,8 +190,8 @@ AUTHOR
|
|
|
|
REVISION
|
|
|
|
- Last updated: 08 January 2014
|
|
- Copyright (c) 1997-2014 University of Cambridge.
|
|
+ Last updated: 10 February 2015
|
|
+ Copyright (c) 1997-2015 University of Cambridge.
|
|
------------------------------------------------------------------------------
|
|
|
|
|
|
@@ -4989,7 +5000,8 @@ BACKSLASH
|
|
appearance of non-printing characters, apart from the binary zero that
|
|
terminates a pattern, but when a pattern is being prepared by text
|
|
editing, it is often easier to use one of the following escape
|
|
- sequences than the binary character it represents:
|
|
+ sequences than the binary character it represents. In an ASCII or Uni-
|
|
+ code environment, these escapes are as follows:
|
|
|
|
\a alarm, that is, the BEL character (hex 07)
|
|
\cx "control-x", where x is any ASCII character
|
|
@@ -5005,55 +5017,67 @@ BACKSLASH
|
|
\x{hhh..} character with hex code hhh.. (non-JavaScript mode)
|
|
\uhhhh character with hex code hhhh (JavaScript mode only)
|
|
|
|
- The precise effect of \cx on ASCII characters is as follows: if x is a
|
|
- lower case letter, it is converted to upper case. Then bit 6 of the
|
|
+ The precise effect of \cx on ASCII characters is as follows: if x is a
|
|
+ lower case letter, it is converted to upper case. Then bit 6 of the
|
|
character (hex 40) is inverted. Thus \cA to \cZ become hex 01 to hex 1A
|
|
- (A is 41, Z is 5A), but \c{ becomes hex 3B ({ is 7B), and \c; becomes
|
|
- hex 7B (; is 3B). If the data item (byte or 16-bit value) following \c
|
|
- has a value greater than 127, a compile-time error occurs. This locks
|
|
+ (A is 41, Z is 5A), but \c{ becomes hex 3B ({ is 7B), and \c; becomes
|
|
+ hex 7B (; is 3B). If the data item (byte or 16-bit value) following \c
|
|
+ has a value greater than 127, a compile-time error occurs. This locks
|
|
out non-ASCII characters in all modes.
|
|
|
|
- The \c facility was designed for use with ASCII characters, but with
|
|
- the extension to Unicode it is even less useful than it once was. It
|
|
- is, however, recognized when PCRE is compiled in EBCDIC mode, where
|
|
- data items are always bytes. In this mode, all values are valid after
|
|
- \c. If the next character is a lower case letter, it is converted to
|
|
- upper case. Then the 0xc0 bits of the byte are inverted. Thus \cA
|
|
- becomes hex 01, as in ASCII (A is C1), but because the EBCDIC letters
|
|
- are disjoint, \cZ becomes hex 29 (Z is E9), and other characters also
|
|
- generate different values.
|
|
-
|
|
- After \0 up to two further octal digits are read. If there are fewer
|
|
- than two digits, just those that are present are used. Thus the
|
|
- sequence \0\x\07 specifies two binary zeros followed by a BEL character
|
|
- (code value 7). Make sure you supply two digits after the initial zero
|
|
+ When PCRE is compiled in EBCDIC mode, \a, \e, \f, \n, \r, and \t gener-
|
|
+ ate the appropriate EBCDIC code values. The \c escape is processed as
|
|
+ specified for Perl in the perlebcdic document. The only characters that
|
|
+ are allowed after \c are A-Z, a-z, or one of @, [, \, ], ^, _, or ?.
|
|
+ Any other character provokes a compile-time error. The sequence \c@
|
|
+ encodes character code 0; the letters (in either case) encode charac-
|
|
+ ters 1-26 (hex 01 to hex 1A); [, \, ], ^, and _ encode characters 27-31
|
|
+ (hex 1B to hex 1F), and \c? becomes either 255 (hex FF) or 95 (hex 5F).
|
|
+
|
|
+ Thus, apart from \c?, these escapes generate the same character code
|
|
+ values as they do in an ASCII environment, though the meanings of the
|
|
+ values mostly differ. For example, \cG always generates code value 7,
|
|
+ which is BEL in ASCII but DEL in EBCDIC.
|
|
+
|
|
+ The sequence \c? generates DEL (127, hex 7F) in an ASCII environment,
|
|
+ but because 127 is not a control character in EBCDIC, Perl makes it
|
|
+ generate the APC character. Unfortunately, there are several variants
|
|
+ of EBCDIC. In most of them the APC character has the value 255 (hex
|
|
+ FF), but in the one Perl calls POSIX-BC its value is 95 (hex 5F). If
|
|
+ certain other characters have POSIX-BC values, PCRE makes \c? generate
|
|
+ 95; otherwise it generates 255.
|
|
+
|
|
+ After \0 up to two further octal digits are read. If there are fewer
|
|
+ than two digits, just those that are present are used. Thus the
|
|
+ sequence \0\x\015 specifies two binary zeros followed by a CR character
|
|
+ (code value 13). Make sure you supply two digits after the initial zero
|
|
if the pattern character that follows is itself an octal digit.
|
|
|
|
- The escape \o must be followed by a sequence of octal digits, enclosed
|
|
- in braces. An error occurs if this is not the case. This escape is a
|
|
- recent addition to Perl; it provides way of specifying character code
|
|
- points as octal numbers greater than 0777, and it also allows octal
|
|
+ The escape \o must be followed by a sequence of octal digits, enclosed
|
|
+ in braces. An error occurs if this is not the case. This escape is a
|
|
+ recent addition to Perl; it provides a way of specifying character code
|
|
+ points as octal numbers greater than 0777, and it also allows octal
|
|
numbers and back references to be unambiguously specified.
|
|
|
|
For greater clarity and unambiguity, it is best to avoid following \ by
|
|
a digit greater than zero. Instead, use \o{} or \x{} to specify charac-
|
|
- ter numbers, and \g{} to specify back references. The following para-
|
|
+ ter numbers, and \g{} to specify back references. The following para-
|
|
graphs describe the old, ambiguous syntax.
|
|
|
|
The handling of a backslash followed by a digit other than 0 is compli-
|
|
- cated, and Perl has changed in recent releases, causing PCRE also to
|
|
+ cated, and Perl has changed in recent releases, causing PCRE also to
|
|
change. Outside a character class, PCRE reads the digit and any follow-
|
|
- ing digits as a decimal number. If the number is less than 8, or if
|
|
- there have been at least that many previous capturing left parentheses
|
|
- in the expression, the entire sequence is taken as a back reference. A
|
|
- description of how this works is given later, following the discussion
|
|
+ ing digits as a decimal number. If the number is less than 8, or if
|
|
+ there have been at least that many previous capturing left parentheses
|
|
+ in the expression, the entire sequence is taken as a back reference. A
|
|
+ description of how this works is given later, following the discussion
|
|
of parenthesized subpatterns.
|
|
|
|
- Inside a character class, or if the decimal number following \ is
|
|
+ Inside a character class, or if the decimal number following \ is
|
|
greater than 7 and there have not been that many capturing subpatterns,
|
|
- PCRE handles \8 and \9 as the literal characters "8" and "9", and oth-
|
|
+ PCRE handles \8 and \9 as the literal characters "8" and "9", and oth-
|
|
erwise re-reads up to three octal digits following the backslash, using
|
|
- them to generate a data character. Any subsequent digits stand for
|
|
+ them to generate a data character. Any subsequent digits stand for
|
|
themselves. For example:
|
|
|
|
\040 is another way of writing an ASCII space
|
|
@@ -5071,31 +5095,31 @@ BACKSLASH
|
|
\81 is either a back reference, or the two
|
|
characters "8" and "1"
|
|
|
|
- Note that octal values of 100 or greater that are specified using this
|
|
- syntax must not be introduced by a leading zero, because no more than
|
|
+ Note that octal values of 100 or greater that are specified using this
|
|
+ syntax must not be introduced by a leading zero, because no more than
|
|
three octal digits are ever read.
|
|
|
|
- By default, after \x that is not followed by {, from zero to two hexa-
|
|
- decimal digits are read (letters can be in upper or lower case). Any
|
|
+ By default, after \x that is not followed by {, from zero to two hexa-
|
|
+ decimal digits are read (letters can be in upper or lower case). Any
|
|
number of hexadecimal digits may appear between \x{ and }. If a charac-
|
|
- ter other than a hexadecimal digit appears between \x{ and }, or if
|
|
+ ter other than a hexadecimal digit appears between \x{ and }, or if
|
|
there is no terminating }, an error occurs.
|
|
|
|
- If the PCRE_JAVASCRIPT_COMPAT option is set, the interpretation of \x
|
|
- is as just described only when it is followed by two hexadecimal dig-
|
|
- its. Otherwise, it matches a literal "x" character. In JavaScript
|
|
+ If the PCRE_JAVASCRIPT_COMPAT option is set, the interpretation of \x
|
|
+ is as just described only when it is followed by two hexadecimal dig-
|
|
+ its. Otherwise, it matches a literal "x" character. In JavaScript
|
|
mode, support for code points greater than 256 is provided by \u, which
|
|
- must be followed by four hexadecimal digits; otherwise it matches a
|
|
+ must be followed by four hexadecimal digits; otherwise it matches a
|
|
literal "u" character.
|
|
|
|
Characters whose value is less than 256 can be defined by either of the
|
|
- two syntaxes for \x (or by \u in JavaScript mode). There is no differ-
|
|
+ two syntaxes for \x (or by \u in JavaScript mode). There is no differ-
|
|
ence in the way they are handled. For example, \xdc is exactly the same
|
|
as \x{dc} (or \u00dc in JavaScript mode).
|
|
|
|
Constraints on character values
|
|
|
|
- Characters that are specified using octal or hexadecimal numbers are
|
|
+ Characters that are specified using octal or hexadecimal numbers are
|
|
limited to certain values, as follows:
|
|
|
|
8-bit non-UTF mode less than 0x100
|
|
@@ -5105,44 +5129,44 @@ BACKSLASH
|
|
32-bit non-UTF mode less than 0x100000000
|
|
32-bit UTF-32 mode less than 0x10ffff and a valid codepoint
|
|
|
|
- Invalid Unicode codepoints are the range 0xd800 to 0xdfff (the so-
|
|
+ Invalid Unicode codepoints are the range 0xd800 to 0xdfff (the so-
|
|
called "surrogate" codepoints), and 0xffef.
|
|
|
|
Escape sequences in character classes
|
|
|
|
All the sequences that define a single character value can be used both
|
|
- inside and outside character classes. In addition, inside a character
|
|
+ inside and outside character classes. In addition, inside a character
|
|
class, \b is interpreted as the backspace character (hex 08).
|
|
|
|
- \N is not allowed in a character class. \B, \R, and \X are not special
|
|
- inside a character class. Like other unrecognized escape sequences,
|
|
- they are treated as the literal characters "B", "R", and "X" by
|
|
- default, but cause an error if the PCRE_EXTRA option is set. Outside a
|
|
+ \N is not allowed in a character class. \B, \R, and \X are not special
|
|
+ inside a character class. Like other unrecognized escape sequences,
|
|
+ they are treated as the literal characters "B", "R", and "X" by
|
|
+ default, but cause an error if the PCRE_EXTRA option is set. Outside a
|
|
character class, these sequences have different meanings.
|
|
|
|
Unsupported escape sequences
|
|
|
|
- In Perl, the sequences \l, \L, \u, and \U are recognized by its string
|
|
- handler and used to modify the case of following characters. By
|
|
- default, PCRE does not support these escape sequences. However, if the
|
|
- PCRE_JAVASCRIPT_COMPAT option is set, \U matches a "U" character, and
|
|
+ In Perl, the sequences \l, \L, \u, and \U are recognized by its string
|
|
+ handler and used to modify the case of following characters. By
|
|
+ default, PCRE does not support these escape sequences. However, if the
|
|
+ PCRE_JAVASCRIPT_COMPAT option is set, \U matches a "U" character, and
|
|
\u can be used to define a character by code point, as described in the
|
|
previous section.
|
|
|
|
Absolute and relative back references
|
|
|
|
- The sequence \g followed by an unsigned or a negative number, option-
|
|
- ally enclosed in braces, is an absolute or relative back reference. A
|
|
+ The sequence \g followed by an unsigned or a negative number, option-
|
|
+ ally enclosed in braces, is an absolute or relative back reference. A
|
|
named back reference can be coded as \g{name}. Back references are dis-
|
|
cussed later, following the discussion of parenthesized subpatterns.
|
|
|
|
Absolute and relative subroutine calls
|
|
|
|
- For compatibility with Oniguruma, the non-Perl syntax \g followed by a
|
|
+ For compatibility with Oniguruma, the non-Perl syntax \g followed by a
|
|
name or a number enclosed either in angle brackets or single quotes, is
|
|
- an alternative syntax for referencing a subpattern as a "subroutine".
|
|
- Details are discussed later. Note that \g{...} (Perl syntax) and
|
|
- \g<...> (Oniguruma syntax) are not synonymous. The former is a back
|
|
+ an alternative syntax for referencing a subpattern as a "subroutine".
|
|
+ Details are discussed later. Note that \g{...} (Perl syntax) and
|
|
+ \g<...> (Oniguruma syntax) are not synonymous. The former is a back
|
|
reference; the latter is a subroutine call.
|
|
|
|
Generic character types
|
|
@@ -5161,59 +5185,59 @@ BACKSLASH
|
|
\W any "non-word" character
|
|
|
|
There is also the single sequence \N, which matches a non-newline char-
|
|
- acter. This is the same as the "." metacharacter when PCRE_DOTALL is
|
|
- not set. Perl also uses \N to match characters by name; PCRE does not
|
|
+ acter. This is the same as the "." metacharacter when PCRE_DOTALL is
|
|
+ not set. Perl also uses \N to match characters by name; PCRE does not
|
|
support this.
|
|
|
|
- Each pair of lower and upper case escape sequences partitions the com-
|
|
- plete set of characters into two disjoint sets. Any given character
|
|
- matches one, and only one, of each pair. The sequences can appear both
|
|
- inside and outside character classes. They each match one character of
|
|
- the appropriate type. If the current matching point is at the end of
|
|
- the subject string, all of them fail, because there is no character to
|
|
+ Each pair of lower and upper case escape sequences partitions the com-
|
|
+ plete set of characters into two disjoint sets. Any given character
|
|
+ matches one, and only one, of each pair. The sequences can appear both
|
|
+ inside and outside character classes. They each match one character of
|
|
+ the appropriate type. If the current matching point is at the end of
|
|
+ the subject string, all of them fail, because there is no character to
|
|
match.
|
|
|
|
- For compatibility with Perl, \s did not used to match the VT character
|
|
- (code 11), which made it different from the the POSIX "space" class.
|
|
- However, Perl added VT at release 5.18, and PCRE followed suit at
|
|
- release 8.34. The default \s characters are now HT (9), LF (10), VT
|
|
- (11), FF (12), CR (13), and space (32), which are defined as white
|
|
+ For compatibility with Perl, \s did not used to match the VT character
|
|
+ (code 11), which made it different from the POSIX "space" class.
|
|
+ However, Perl added VT at release 5.18, and PCRE followed suit at
|
|
+ release 8.34. The default \s characters are now HT (9), LF (10), VT
|
|
+ (11), FF (12), CR (13), and space (32), which are defined as white
|
|
space in the "C" locale. This list may vary if locale-specific matching
|
|
- is taking place. For example, in some locales the "non-breaking space"
|
|
- character (\xA0) is recognized as white space, and in others the VT
|
|
+ is taking place. For example, in some locales the "non-breaking space"
|
|
+ character (\xA0) is recognized as white space, and in others the VT
|
|
character is not.
|
|
|
|
- A "word" character is an underscore or any character that is a letter
|
|
- or digit. By default, the definition of letters and digits is con-
|
|
- trolled by PCRE's low-valued character tables, and may vary if locale-
|
|
- specific matching is taking place (see "Locale support" in the pcreapi
|
|
- page). For example, in a French locale such as "fr_FR" in Unix-like
|
|
- systems, or "french" in Windows, some character codes greater than 127
|
|
- are used for accented letters, and these are then matched by \w. The
|
|
+ A "word" character is an underscore or any character that is a letter
|
|
+ or digit. By default, the definition of letters and digits is con-
|
|
+ trolled by PCRE's low-valued character tables, and may vary if locale-
|
|
+ specific matching is taking place (see "Locale support" in the pcreapi
|
|
+ page). For example, in a French locale such as "fr_FR" in Unix-like
|
|
+ systems, or "french" in Windows, some character codes greater than 127
|
|
+ are used for accented letters, and these are then matched by \w. The
|
|
use of locales with Unicode is discouraged.
|
|
|
|
- By default, characters whose code points are greater than 127 never
|
|
+ By default, characters whose code points are greater than 127 never
|
|
match \d, \s, or \w, and always match \D, \S, and \W, although this may
|
|
- vary for characters in the range 128-255 when locale-specific matching
|
|
- is happening. These escape sequences retain their original meanings
|
|
- from before Unicode support was available, mainly for efficiency rea-
|
|
- sons. If PCRE is compiled with Unicode property support, and the
|
|
- PCRE_UCP option is set, the behaviour is changed so that Unicode prop-
|
|
+ vary for characters in the range 128-255 when locale-specific matching
|
|
+ is happening. These escape sequences retain their original meanings
|
|
+ from before Unicode support was available, mainly for efficiency rea-
|
|
+ sons. If PCRE is compiled with Unicode property support, and the
|
|
+ PCRE_UCP option is set, the behaviour is changed so that Unicode prop-
|
|
erties are used to determine character types, as follows:
|
|
|
|
\d any character that matches \p{Nd} (decimal digit)
|
|
\s any character that matches \p{Z} or \h or \v
|
|
\w any character that matches \p{L} or \p{N}, plus underscore
|
|
|
|
- The upper case escapes match the inverse sets of characters. Note that
|
|
- \d matches only decimal digits, whereas \w matches any Unicode digit,
|
|
- as well as any Unicode letter, and underscore. Note also that PCRE_UCP
|
|
- affects \b, and \B because they are defined in terms of \w and \W.
|
|
+ The upper case escapes match the inverse sets of characters. Note that
|
|
+ \d matches only decimal digits, whereas \w matches any Unicode digit,
|
|
+ as well as any Unicode letter, and underscore. Note also that PCRE_UCP
|
|
+ affects \b, and \B because they are defined in terms of \w and \W.
|
|
Matching these sequences is noticeably slower when PCRE_UCP is set.
|
|
|
|
- The sequences \h, \H, \v, and \V are features that were added to Perl
|
|
- at release 5.10. In contrast to the other sequences, which match only
|
|
- ASCII characters by default, these always match certain high-valued
|
|
+ The sequences \h, \H, \v, and \V are features that were added to Perl
|
|
+ at release 5.10. In contrast to the other sequences, which match only
|
|
+ ASCII characters by default, these always match certain high-valued
|
|
code points, whether or not PCRE_UCP is set. The horizontal space char-
|
|
acters are:
|
|
|
|
@@ -5252,110 +5276,110 @@ BACKSLASH
|
|
|
|
Newline sequences
|
|
|
|
- Outside a character class, by default, the escape sequence \R matches
|
|
- any Unicode newline sequence. In 8-bit non-UTF-8 mode \R is equivalent
|
|
+ Outside a character class, by default, the escape sequence \R matches
|
|
+ any Unicode newline sequence. In 8-bit non-UTF-8 mode \R is equivalent
|
|
to the following:
|
|
|
|
(?>\r\n|\n|\x0b|\f|\r|\x85)
|
|
|
|
- This is an example of an "atomic group", details of which are given
|
|
+ This is an example of an "atomic group", details of which are given
|
|
below. This particular group matches either the two-character sequence
|
|
- CR followed by LF, or one of the single characters LF (linefeed,
|
|
- U+000A), VT (vertical tab, U+000B), FF (form feed, U+000C), CR (car-
|
|
- riage return, U+000D), or NEL (next line, U+0085). The two-character
|
|
+ CR followed by LF, or one of the single characters LF (linefeed,
|
|
+ U+000A), VT (vertical tab, U+000B), FF (form feed, U+000C), CR (car-
|
|
+ riage return, U+000D), or NEL (next line, U+0085). The two-character
|
|
sequence is treated as a single unit that cannot be split.
|
|
|
|
- In other modes, two additional characters whose codepoints are greater
|
|
+ In other modes, two additional characters whose codepoints are greater
|
|
than 255 are added: LS (line separator, U+2028) and PS (paragraph sepa-
|
|
- rator, U+2029). Unicode character property support is not needed for
|
|
+ rator, U+2029). Unicode character property support is not needed for
|
|
these characters to be recognized.
|
|
|
|
It is possible to restrict \R to match only CR, LF, or CRLF (instead of
|
|
- the complete set of Unicode line endings) by setting the option
|
|
+ the complete set of Unicode line endings) by setting the option
|
|
PCRE_BSR_ANYCRLF either at compile time or when the pattern is matched.
|
|
(BSR is an abbrevation for "backslash R".) This can be made the default
|
|
- when PCRE is built; if this is the case, the other behaviour can be
|
|
- requested via the PCRE_BSR_UNICODE option. It is also possible to
|
|
- specify these settings by starting a pattern string with one of the
|
|
+ when PCRE is built; if this is the case, the other behaviour can be
|
|
+ requested via the PCRE_BSR_UNICODE option. It is also possible to
|
|
+ specify these settings by starting a pattern string with one of the
|
|
following sequences:
|
|
|
|
(*BSR_ANYCRLF) CR, LF, or CRLF only
|
|
(*BSR_UNICODE) any Unicode newline sequence
|
|
|
|
These override the default and the options given to the compiling func-
|
|
- tion, but they can themselves be overridden by options given to a
|
|
- matching function. Note that these special settings, which are not
|
|
- Perl-compatible, are recognized only at the very start of a pattern,
|
|
- and that they must be in upper case. If more than one of them is
|
|
- present, the last one is used. They can be combined with a change of
|
|
+ tion, but they can themselves be overridden by options given to a
|
|
+ matching function. Note that these special settings, which are not
|
|
+ Perl-compatible, are recognized only at the very start of a pattern,
|
|
+ and that they must be in upper case. If more than one of them is
|
|
+ present, the last one is used. They can be combined with a change of
|
|
newline convention; for example, a pattern can start with:
|
|
|
|
(*ANY)(*BSR_ANYCRLF)
|
|
|
|
- They can also be combined with the (*UTF8), (*UTF16), (*UTF32), (*UTF)
|
|
+ They can also be combined with the (*UTF8), (*UTF16), (*UTF32), (*UTF)
|
|
or (*UCP) special sequences. Inside a character class, \R is treated as
|
|
- an unrecognized escape sequence, and so matches the letter "R" by
|
|
+ an unrecognized escape sequence, and so matches the letter "R" by
|
|
default, but causes an error if PCRE_EXTRA is set.
|
|
|
|
Unicode character properties
|
|
|
|
When PCRE is built with Unicode character property support, three addi-
|
|
- tional escape sequences that match characters with specific properties
|
|
- are available. When in 8-bit non-UTF-8 mode, these sequences are of
|
|
- course limited to testing characters whose codepoints are less than
|
|
+ tional escape sequences that match characters with specific properties
|
|
+ are available. When in 8-bit non-UTF-8 mode, these sequences are of
|
|
+ course limited to testing characters whose codepoints are less than
|
|
256, but they do work in this mode. The extra escape sequences are:
|
|
|
|
\p{xx} a character with the xx property
|
|
\P{xx} a character without the xx property
|
|
\X a Unicode extended grapheme cluster
|
|
|
|
- The property names represented by xx above are limited to the Unicode
|
|
+ The property names represented by xx above are limited to the Unicode
|
|
script names, the general category properties, "Any", which matches any
|
|
- character (including newline), and some special PCRE properties
|
|
- (described in the next section). Other Perl properties such as "InMu-
|
|
- sicalSymbols" are not currently supported by PCRE. Note that \P{Any}
|
|
+ character (including newline), and some special PCRE properties
|
|
+ (described in the next section). Other Perl properties such as "InMu-
|
|
+ sicalSymbols" are not currently supported by PCRE. Note that \P{Any}
|
|
does not match any characters, so always causes a match failure.
|
|
|
|
Sets of Unicode characters are defined as belonging to certain scripts.
|
|
- A character from one of these sets can be matched using a script name.
|
|
+ A character from one of these sets can be matched using a script name.
|
|
For example:
|
|
|
|
\p{Greek}
|
|
\P{Han}
|
|
|
|
- Those that are not part of an identified script are lumped together as
|
|
+ Those that are not part of an identified script are lumped together as
|
|
"Common". The current list of scripts is:
|
|
|
|
- Arabic, Armenian, Avestan, Balinese, Bamum, Bassa_Vah, Batak, Bengali,
|
|
- Bopomofo, Brahmi, Braille, Buginese, Buhid, Canadian_Aboriginal, Car-
|
|
+ Arabic, Armenian, Avestan, Balinese, Bamum, Bassa_Vah, Batak, Bengali,
|
|
+ Bopomofo, Brahmi, Braille, Buginese, Buhid, Canadian_Aboriginal, Car-
|
|
ian, Caucasian_Albanian, Chakma, Cham, Cherokee, Common, Coptic, Cunei-
|
|
form, Cypriot, Cyrillic, Deseret, Devanagari, Duployan, Egyptian_Hiero-
|
|
glyphs, Elbasan, Ethiopic, Georgian, Glagolitic, Gothic, Grantha,
|
|
- Greek, Gujarati, Gurmukhi, Han, Hangul, Hanunoo, Hebrew, Hiragana,
|
|
- Imperial_Aramaic, Inherited, Inscriptional_Pahlavi, Inscrip-
|
|
- tional_Parthian, Javanese, Kaithi, Kannada, Katakana, Kayah_Li,
|
|
- Kharoshthi, Khmer, Khojki, Khudawadi, Lao, Latin, Lepcha, Limbu, Lin-
|
|
- ear_A, Linear_B, Lisu, Lycian, Lydian, Mahajani, Malayalam, Mandaic,
|
|
- Manichaean, Meetei_Mayek, Mende_Kikakui, Meroitic_Cursive,
|
|
- Meroitic_Hieroglyphs, Miao, Modi, Mongolian, Mro, Myanmar, Nabataean,
|
|
- New_Tai_Lue, Nko, Ogham, Ol_Chiki, Old_Italic, Old_North_Arabian,
|
|
+ Greek, Gujarati, Gurmukhi, Han, Hangul, Hanunoo, Hebrew, Hiragana,
|
|
+ Imperial_Aramaic, Inherited, Inscriptional_Pahlavi, Inscrip-
|
|
+ tional_Parthian, Javanese, Kaithi, Kannada, Katakana, Kayah_Li,
|
|
+ Kharoshthi, Khmer, Khojki, Khudawadi, Lao, Latin, Lepcha, Limbu, Lin-
|
|
+ ear_A, Linear_B, Lisu, Lycian, Lydian, Mahajani, Malayalam, Mandaic,
|
|
+ Manichaean, Meetei_Mayek, Mende_Kikakui, Meroitic_Cursive,
|
|
+ Meroitic_Hieroglyphs, Miao, Modi, Mongolian, Mro, Myanmar, Nabataean,
|
|
+ New_Tai_Lue, Nko, Ogham, Ol_Chiki, Old_Italic, Old_North_Arabian,
|
|
Old_Permic, Old_Persian, Old_South_Arabian, Old_Turkic, Oriya, Osmanya,
|
|
Pahawh_Hmong, Palmyrene, Pau_Cin_Hau, Phags_Pa, Phoenician,
|
|
- Psalter_Pahlavi, Rejang, Runic, Samaritan, Saurashtra, Sharada, Sha-
|
|
- vian, Siddham, Sinhala, Sora_Sompeng, Sundanese, Syloti_Nagri, Syriac,
|
|
- Tagalog, Tagbanwa, Tai_Le, Tai_Tham, Tai_Viet, Takri, Tamil, Telugu,
|
|
- Thaana, Thai, Tibetan, Tifinagh, Tirhuta, Ugaritic, Vai, Warang_Citi,
|
|
+ Psalter_Pahlavi, Rejang, Runic, Samaritan, Saurashtra, Sharada, Sha-
|
|
+ vian, Siddham, Sinhala, Sora_Sompeng, Sundanese, Syloti_Nagri, Syriac,
|
|
+ Tagalog, Tagbanwa, Tai_Le, Tai_Tham, Tai_Viet, Takri, Tamil, Telugu,
|
|
+ Thaana, Thai, Tibetan, Tifinagh, Tirhuta, Ugaritic, Vai, Warang_Citi,
|
|
Yi.
|
|
|
|
Each character has exactly one Unicode general category property, spec-
|
|
- ified by a two-letter abbreviation. For compatibility with Perl, nega-
|
|
- tion can be specified by including a circumflex between the opening
|
|
- brace and the property name. For example, \p{^Lu} is the same as
|
|
+ ified by a two-letter abbreviation. For compatibility with Perl, nega-
|
|
+ tion can be specified by including a circumflex between the opening
|
|
+ brace and the property name. For example, \p{^Lu} is the same as
|
|
\P{Lu}.
|
|
|
|
If only one letter is specified with \p or \P, it includes all the gen-
|
|
- eral category properties that start with that letter. In this case, in
|
|
- the absence of negation, the curly brackets in the escape sequence are
|
|
+ eral category properties that start with that letter. In this case, in
|
|
+ the absence of negation, the curly brackets in the escape sequence are
|
|
optional; these two examples have the same effect:
|
|
|
|
\p{L}
|
|
@@ -5407,73 +5431,73 @@ BACKSLASH
|
|
Zp Paragraph separator
|
|
Zs Space separator
|
|
|
|
- The special property L& is also supported: it matches a character that
|
|
- has the Lu, Ll, or Lt property, in other words, a letter that is not
|
|
+ The special property L& is also supported: it matches a character that
|
|
+ has the Lu, Ll, or Lt property, in other words, a letter that is not
|
|
classified as a modifier or "other".
|
|
|
|
- The Cs (Surrogate) property applies only to characters in the range
|
|
- U+D800 to U+DFFF. Such characters are not valid in Unicode strings and
|
|
- so cannot be tested by PCRE, unless UTF validity checking has been
|
|
+ The Cs (Surrogate) property applies only to characters in the range
|
|
+ U+D800 to U+DFFF. Such characters are not valid in Unicode strings and
|
|
+ so cannot be tested by PCRE, unless UTF validity checking has been
|
|
turned off (see the discussion of PCRE_NO_UTF8_CHECK,
|
|
- PCRE_NO_UTF16_CHECK and PCRE_NO_UTF32_CHECK in the pcreapi page). Perl
|
|
+ PCRE_NO_UTF16_CHECK and PCRE_NO_UTF32_CHECK in the pcreapi page). Perl
|
|
does not support the Cs property.
|
|
|
|
- The long synonyms for property names that Perl supports (such as
|
|
- \p{Letter}) are not supported by PCRE, nor is it permitted to prefix
|
|
+ The long synonyms for property names that Perl supports (such as
|
|
+ \p{Letter}) are not supported by PCRE, nor is it permitted to prefix
|
|
any of these properties with "Is".
|
|
|
|
No character that is in the Unicode table has the Cn (unassigned) prop-
|
|
erty. Instead, this property is assumed for any code point that is not
|
|
in the Unicode table.
|
|
|
|
- Specifying caseless matching does not affect these escape sequences.
|
|
- For example, \p{Lu} always matches only upper case letters. This is
|
|
+ Specifying caseless matching does not affect these escape sequences.
|
|
+ For example, \p{Lu} always matches only upper case letters. This is
|
|
different from the behaviour of current versions of Perl.
|
|
|
|
- Matching characters by Unicode property is not fast, because PCRE has
|
|
- to do a multistage table lookup in order to find a character's prop-
|
|
+ Matching characters by Unicode property is not fast, because PCRE has
|
|
+ to do a multistage table lookup in order to find a character's prop-
|
|
erty. That is why the traditional escape sequences such as \d and \w do
|
|
not use Unicode properties in PCRE by default, though you can make them
|
|
- do so by setting the PCRE_UCP option or by starting the pattern with
|
|
+ do so by setting the PCRE_UCP option or by starting the pattern with
|
|
(*UCP).
|
|
|
|
Extended grapheme clusters
|
|
|
|
- The \X escape matches any number of Unicode characters that form an
|
|
+ The \X escape matches any number of Unicode characters that form an
|
|
"extended grapheme cluster", and treats the sequence as an atomic group
|
|
- (see below). Up to and including release 8.31, PCRE matched an ear-
|
|
+ (see below). Up to and including release 8.31, PCRE matched an ear-
|
|
lier, simpler definition that was equivalent to
|
|
|
|
(?>\PM\pM*)
|
|
|
|
- That is, it matched a character without the "mark" property, followed
|
|
- by zero or more characters with the "mark" property. Characters with
|
|
- the "mark" property are typically non-spacing accents that affect the
|
|
+ That is, it matched a character without the "mark" property, followed
|
|
+ by zero or more characters with the "mark" property. Characters with
|
|
+ the "mark" property are typically non-spacing accents that affect the
|
|
preceding character.
|
|
|
|
- This simple definition was extended in Unicode to include more compli-
|
|
- cated kinds of composite character by giving each character a grapheme
|
|
- breaking property, and creating rules that use these properties to
|
|
- define the boundaries of extended grapheme clusters. In releases of
|
|
+ This simple definition was extended in Unicode to include more compli-
|
|
+ cated kinds of composite character by giving each character a grapheme
|
|
+ breaking property, and creating rules that use these properties to
|
|
+ define the boundaries of extended grapheme clusters. In releases of
|
|
PCRE later than 8.31, \X matches one of these clusters.
|
|
|
|
- \X always matches at least one character. Then it decides whether to
|
|
+ \X always matches at least one character. Then it decides whether to
|
|
add additional characters according to the following rules for ending a
|
|
cluster:
|
|
|
|
1. End at the end of the subject string.
|
|
|
|
- 2. Do not end between CR and LF; otherwise end after any control char-
|
|
+ 2. Do not end between CR and LF; otherwise end after any control char-
|
|
acter.
|
|
|
|
- 3. Do not break Hangul (a Korean script) syllable sequences. Hangul
|
|
- characters are of five types: L, V, T, LV, and LVT. An L character may
|
|
- be followed by an L, V, LV, or LVT character; an LV or V character may
|
|
+ 3. Do not break Hangul (a Korean script) syllable sequences. Hangul
|
|
+ characters are of five types: L, V, T, LV, and LVT. An L character may
|
|
+ be followed by an L, V, LV, or LVT character; an LV or V character may
|
|
be followed by a V or T character; an LVT or T character may be follwed
|
|
only by a T character.
|
|
|
|
- 4. Do not end before extending characters or spacing marks. Characters
|
|
- with the "mark" property always have the "extend" grapheme breaking
|
|
+ 4. Do not end before extending characters or spacing marks. Characters
|
|
+ with the "mark" property always have the "extend" grapheme breaking
|
|
property.
|
|
|
|
5. Do not end after prepend characters.
|
|
@@ -5482,9 +5506,9 @@ BACKSLASH
|
|
|
|
PCRE's additional properties
|
|
|
|
- As well as the standard Unicode properties described above, PCRE sup-
|
|
- ports four more that make it possible to convert traditional escape
|
|
- sequences such as \w and \s to use Unicode properties. PCRE uses these
|
|
+ As well as the standard Unicode properties described above, PCRE sup-
|
|
+ ports four more that make it possible to convert traditional escape
|
|
+ sequences such as \w and \s to use Unicode properties. PCRE uses these
|
|
non-standard, non-Perl properties internally when PCRE_UCP is set. How-
|
|
ever, they may also be used explicitly. These properties are:
|
|
|
|
@@ -5493,54 +5517,54 @@ BACKSLASH
|
|
Xsp Any Perl space character
|
|
Xwd Any Perl "word" character
|
|
|
|
- Xan matches characters that have either the L (letter) or the N (num-
|
|
- ber) property. Xps matches the characters tab, linefeed, vertical tab,
|
|
- form feed, or carriage return, and any other character that has the Z
|
|
- (separator) property. Xsp is the same as Xps; it used to exclude ver-
|
|
- tical tab, for Perl compatibility, but Perl changed, and so PCRE fol-
|
|
- lowed at release 8.34. Xwd matches the same characters as Xan, plus
|
|
+ Xan matches characters that have either the L (letter) or the N (num-
|
|
+ ber) property. Xps matches the characters tab, linefeed, vertical tab,
|
|
+ form feed, or carriage return, and any other character that has the Z
|
|
+ (separator) property. Xsp is the same as Xps; it used to exclude ver-
|
|
+ tical tab, for Perl compatibility, but Perl changed, and so PCRE fol-
|
|
+ lowed at release 8.34. Xwd matches the same characters as Xan, plus
|
|
underscore.
|
|
|
|
- There is another non-standard property, Xuc, which matches any charac-
|
|
- ter that can be represented by a Universal Character Name in C++ and
|
|
- other programming languages. These are the characters $, @, ` (grave
|
|
- accent), and all characters with Unicode code points greater than or
|
|
- equal to U+00A0, except for the surrogates U+D800 to U+DFFF. Note that
|
|
- most base (ASCII) characters are excluded. (Universal Character Names
|
|
- are of the form \uHHHH or \UHHHHHHHH where H is a hexadecimal digit.
|
|
+ There is another non-standard property, Xuc, which matches any charac-
|
|
+ ter that can be represented by a Universal Character Name in C++ and
|
|
+ other programming languages. These are the characters $, @, ` (grave
|
|
+ accent), and all characters with Unicode code points greater than or
|
|
+ equal to U+00A0, except for the surrogates U+D800 to U+DFFF. Note that
|
|
+ most base (ASCII) characters are excluded. (Universal Character Names
|
|
+ are of the form \uHHHH or \UHHHHHHHH where H is a hexadecimal digit.
|
|
Note that the Xuc property does not match these sequences but the char-
|
|
acters that they represent.)
|
|
|
|
Resetting the match start
|
|
|
|
- The escape sequence \K causes any previously matched characters not to
|
|
+ The escape sequence \K causes any previously matched characters not to
|
|
be included in the final matched sequence. For example, the pattern:
|
|
|
|
foo\Kbar
|
|
|
|
- matches "foobar", but reports that it has matched "bar". This feature
|
|
- is similar to a lookbehind assertion (described below). However, in
|
|
- this case, the part of the subject before the real match does not have
|
|
- to be of fixed length, as lookbehind assertions do. The use of \K does
|
|
- not interfere with the setting of captured substrings. For example,
|
|
+ matches "foobar", but reports that it has matched "bar". This feature
|
|
+ is similar to a lookbehind assertion (described below). However, in
|
|
+ this case, the part of the subject before the real match does not have
|
|
+ to be of fixed length, as lookbehind assertions do. The use of \K does
|
|
+ not interfere with the setting of captured substrings. For example,
|
|
when the pattern
|
|
|
|
(foo)\Kbar
|
|
|
|
matches "foobar", the first substring is still set to "foo".
|
|
|
|
- Perl documents that the use of \K within assertions is "not well
|
|
- defined". In PCRE, \K is acted upon when it occurs inside positive
|
|
- assertions, but is ignored in negative assertions. Note that when a
|
|
- pattern such as (?=ab\K) matches, the reported start of the match can
|
|
+ Perl documents that the use of \K within assertions is "not well
|
|
+ defined". In PCRE, \K is acted upon when it occurs inside positive
|
|
+ assertions, but is ignored in negative assertions. Note that when a
|
|
+ pattern such as (?=ab\K) matches, the reported start of the match can
|
|
be greater than the end of the match.
|
|
|
|
Simple assertions
|
|
|
|
- The final use of backslash is for certain simple assertions. An asser-
|
|
- tion specifies a condition that has to be met at a particular point in
|
|
- a match, without consuming any characters from the subject string. The
|
|
- use of subpatterns for more complicated assertions is described below.
|
|
+ The final use of backslash is for certain simple assertions. An asser-
|
|
+ tion specifies a condition that has to be met at a particular point in
|
|
+ a match, without consuming any characters from the subject string. The
|
|
+ use of subpatterns for more complicated assertions is described below.
|
|
The backslashed assertions are:
|
|
|
|
\b matches at a word boundary
|
|
@@ -5551,161 +5575,161 @@ BACKSLASH
|
|
\z matches only at the end of the subject
|
|
\G matches at the first matching position in the subject
|
|
|
|
- Inside a character class, \b has a different meaning; it matches the
|
|
- backspace character. If any other of these assertions appears in a
|
|
- character class, by default it matches the corresponding literal char-
|
|
+ Inside a character class, \b has a different meaning; it matches the
|
|
+ backspace character. If any other of these assertions appears in a
|
|
+ character class, by default it matches the corresponding literal char-
|
|
acter (for example, \B matches the letter B). However, if the
|
|
- PCRE_EXTRA option is set, an "invalid escape sequence" error is gener-
|
|
+ PCRE_EXTRA option is set, an "invalid escape sequence" error is gener-
|
|
ated instead.
|
|
|
|
- A word boundary is a position in the subject string where the current
|
|
- character and the previous character do not both match \w or \W (i.e.
|
|
- one matches \w and the other matches \W), or the start or end of the
|
|
- string if the first or last character matches \w, respectively. In a
|
|
- UTF mode, the meanings of \w and \W can be changed by setting the
|
|
- PCRE_UCP option. When this is done, it also affects \b and \B. Neither
|
|
- PCRE nor Perl has a separate "start of word" or "end of word" metase-
|
|
- quence. However, whatever follows \b normally determines which it is.
|
|
+ A word boundary is a position in the subject string where the current
|
|
+ character and the previous character do not both match \w or \W (i.e.
|
|
+ one matches \w and the other matches \W), or the start or end of the
|
|
+ string if the first or last character matches \w, respectively. In a
|
|
+ UTF mode, the meanings of \w and \W can be changed by setting the
|
|
+ PCRE_UCP option. When this is done, it also affects \b and \B. Neither
|
|
+ PCRE nor Perl has a separate "start of word" or "end of word" metase-
|
|
+ quence. However, whatever follows \b normally determines which it is.
|
|
For example, the fragment \ba matches "a" at the start of a word.
|
|
|
|
- The \A, \Z, and \z assertions differ from the traditional circumflex
|
|
+ The \A, \Z, and \z assertions differ from the traditional circumflex
|
|
and dollar (described in the next section) in that they only ever match
|
|
- at the very start and end of the subject string, whatever options are
|
|
- set. Thus, they are independent of multiline mode. These three asser-
|
|
+ at the very start and end of the subject string, whatever options are
|
|
+ set. Thus, they are independent of multiline mode. These three asser-
|
|
tions are not affected by the PCRE_NOTBOL or PCRE_NOTEOL options, which
|
|
- affect only the behaviour of the circumflex and dollar metacharacters.
|
|
- However, if the startoffset argument of pcre_exec() is non-zero, indi-
|
|
+ affect only the behaviour of the circumflex and dollar metacharacters.
|
|
+ However, if the startoffset argument of pcre_exec() is non-zero, indi-
|
|
cating that matching is to start at a point other than the beginning of
|
|
- the subject, \A can never match. The difference between \Z and \z is
|
|
+ the subject, \A can never match. The difference between \Z and \z is
|
|
that \Z matches before a newline at the end of the string as well as at
|
|
the very end, whereas \z matches only at the end.
|
|
|
|
- The \G assertion is true only when the current matching position is at
|
|
- the start point of the match, as specified by the startoffset argument
|
|
- of pcre_exec(). It differs from \A when the value of startoffset is
|
|
- non-zero. By calling pcre_exec() multiple times with appropriate argu-
|
|
+ The \G assertion is true only when the current matching position is at
|
|
+ the start point of the match, as specified by the startoffset argument
|
|
+ of pcre_exec(). It differs from \A when the value of startoffset is
|
|
+ non-zero. By calling pcre_exec() multiple times with appropriate argu-
|
|
ments, you can mimic Perl's /g option, and it is in this kind of imple-
|
|
mentation where \G can be useful.
|
|
|
|
- Note, however, that PCRE's interpretation of \G, as the start of the
|
|
+ Note, however, that PCRE's interpretation of \G, as the start of the
|
|
current match, is subtly different from Perl's, which defines it as the
|
|
- end of the previous match. In Perl, these can be different when the
|
|
- previously matched string was empty. Because PCRE does just one match
|
|
+ end of the previous match. In Perl, these can be different when the
|
|
+ previously matched string was empty. Because PCRE does just one match
|
|
at a time, it cannot reproduce this behaviour.
|
|
|
|
- If all the alternatives of a pattern begin with \G, the expression is
|
|
+ If all the alternatives of a pattern begin with \G, the expression is
|
|
anchored to the starting match position, and the "anchored" flag is set
|
|
in the compiled regular expression.
|
|
|
|
|
|
CIRCUMFLEX AND DOLLAR
|
|
|
|
- The circumflex and dollar metacharacters are zero-width assertions.
|
|
- That is, they test for a particular condition being true without con-
|
|
+ The circumflex and dollar metacharacters are zero-width assertions.
|
|
+ That is, they test for a particular condition being true without con-
|
|
suming any characters from the subject string.
|
|
|
|
Outside a character class, in the default matching mode, the circumflex
|
|
- character is an assertion that is true only if the current matching
|
|
- point is at the start of the subject string. If the startoffset argu-
|
|
- ment of pcre_exec() is non-zero, circumflex can never match if the
|
|
- PCRE_MULTILINE option is unset. Inside a character class, circumflex
|
|
+ character is an assertion that is true only if the current matching
|
|
+ point is at the start of the subject string. If the startoffset argu-
|
|
+ ment of pcre_exec() is non-zero, circumflex can never match if the
|
|
+ PCRE_MULTILINE option is unset. Inside a character class, circumflex
|
|
has an entirely different meaning (see below).
|
|
|
|
- Circumflex need not be the first character of the pattern if a number
|
|
- of alternatives are involved, but it should be the first thing in each
|
|
- alternative in which it appears if the pattern is ever to match that
|
|
- branch. If all possible alternatives start with a circumflex, that is,
|
|
- if the pattern is constrained to match only at the start of the sub-
|
|
- ject, it is said to be an "anchored" pattern. (There are also other
|
|
+ Circumflex need not be the first character of the pattern if a number
|
|
+ of alternatives are involved, but it should be the first thing in each
|
|
+ alternative in which it appears if the pattern is ever to match that
|
|
+ branch. If all possible alternatives start with a circumflex, that is,
|
|
+ if the pattern is constrained to match only at the start of the sub-
|
|
+ ject, it is said to be an "anchored" pattern. (There are also other
|
|
constructs that can cause a pattern to be anchored.)
|
|
|
|
- The dollar character is an assertion that is true only if the current
|
|
- matching point is at the end of the subject string, or immediately
|
|
- before a newline at the end of the string (by default). Note, however,
|
|
- that it does not actually match the newline. Dollar need not be the
|
|
+ The dollar character is an assertion that is true only if the current
|
|
+ matching point is at the end of the subject string, or immediately
|
|
+ before a newline at the end of the string (by default). Note, however,
|
|
+ that it does not actually match the newline. Dollar need not be the
|
|
last character of the pattern if a number of alternatives are involved,
|
|
- but it should be the last item in any branch in which it appears. Dol-
|
|
+ but it should be the last item in any branch in which it appears. Dol-
|
|
lar has no special meaning in a character class.
|
|
|
|
- The meaning of dollar can be changed so that it matches only at the
|
|
- very end of the string, by setting the PCRE_DOLLAR_ENDONLY option at
|
|
+ The meaning of dollar can be changed so that it matches only at the
|
|
+ very end of the string, by setting the PCRE_DOLLAR_ENDONLY option at
|
|
compile time. This does not affect the \Z assertion.
|
|
|
|
The meanings of the circumflex and dollar characters are changed if the
|
|
- PCRE_MULTILINE option is set. When this is the case, a circumflex
|
|
- matches immediately after internal newlines as well as at the start of
|
|
- the subject string. It does not match after a newline that ends the
|
|
- string. A dollar matches before any newlines in the string, as well as
|
|
- at the very end, when PCRE_MULTILINE is set. When newline is specified
|
|
- as the two-character sequence CRLF, isolated CR and LF characters do
|
|
+ PCRE_MULTILINE option is set. When this is the case, a circumflex
|
|
+ matches immediately after internal newlines as well as at the start of
|
|
+ the subject string. It does not match after a newline that ends the
|
|
+ string. A dollar matches before any newlines in the string, as well as
|
|
+ at the very end, when PCRE_MULTILINE is set. When newline is specified
|
|
+ as the two-character sequence CRLF, isolated CR and LF characters do
|
|
not indicate newlines.
|
|
|
|
- For example, the pattern /^abc$/ matches the subject string "def\nabc"
|
|
- (where \n represents a newline) in multiline mode, but not otherwise.
|
|
- Consequently, patterns that are anchored in single line mode because
|
|
- all branches start with ^ are not anchored in multiline mode, and a
|
|
- match for circumflex is possible when the startoffset argument of
|
|
- pcre_exec() is non-zero. The PCRE_DOLLAR_ENDONLY option is ignored if
|
|
+ For example, the pattern /^abc$/ matches the subject string "def\nabc"
|
|
+ (where \n represents a newline) in multiline mode, but not otherwise.
|
|
+ Consequently, patterns that are anchored in single line mode because
|
|
+ all branches start with ^ are not anchored in multiline mode, and a
|
|
+ match for circumflex is possible when the startoffset argument of
|
|
+ pcre_exec() is non-zero. The PCRE_DOLLAR_ENDONLY option is ignored if
|
|
PCRE_MULTILINE is set.
|
|
|
|
- Note that the sequences \A, \Z, and \z can be used to match the start
|
|
- and end of the subject in both modes, and if all branches of a pattern
|
|
- start with \A it is always anchored, whether or not PCRE_MULTILINE is
|
|
+ Note that the sequences \A, \Z, and \z can be used to match the start
|
|
+ and end of the subject in both modes, and if all branches of a pattern
|
|
+ start with \A it is always anchored, whether or not PCRE_MULTILINE is
|
|
set.
|
|
|
|
|
|
FULL STOP (PERIOD, DOT) AND \N
|
|
|
|
Outside a character class, a dot in the pattern matches any one charac-
|
|
- ter in the subject string except (by default) a character that signi-
|
|
+ ter in the subject string except (by default) a character that signi-
|
|
fies the end of a line.
|
|
|
|
- When a line ending is defined as a single character, dot never matches
|
|
- that character; when the two-character sequence CRLF is used, dot does
|
|
- not match CR if it is immediately followed by LF, but otherwise it
|
|
- matches all characters (including isolated CRs and LFs). When any Uni-
|
|
- code line endings are being recognized, dot does not match CR or LF or
|
|
+ When a line ending is defined as a single character, dot never matches
|
|
+ that character; when the two-character sequence CRLF is used, dot does
|
|
+ not match CR if it is immediately followed by LF, but otherwise it
|
|
+ matches all characters (including isolated CRs and LFs). When any Uni-
|
|
+ code line endings are being recognized, dot does not match CR or LF or
|
|
any of the other line ending characters.
|
|
|
|
- The behaviour of dot with regard to newlines can be changed. If the
|
|
- PCRE_DOTALL option is set, a dot matches any one character, without
|
|
+ The behaviour of dot with regard to newlines can be changed. If the
|
|
+ PCRE_DOTALL option is set, a dot matches any one character, without
|
|
exception. If the two-character sequence CRLF is present in the subject
|
|
string, it takes two dots to match it.
|
|
|
|
- The handling of dot is entirely independent of the handling of circum-
|
|
- flex and dollar, the only relationship being that they both involve
|
|
+ The handling of dot is entirely independent of the handling of circum-
|
|
+ flex and dollar, the only relationship being that they both involve
|
|
newlines. Dot has no special meaning in a character class.
|
|
|
|
- The escape sequence \N behaves like a dot, except that it is not
|
|
- affected by the PCRE_DOTALL option. In other words, it matches any
|
|
- character except one that signifies the end of a line. Perl also uses
|
|
+ The escape sequence \N behaves like a dot, except that it is not
|
|
+ affected by the PCRE_DOTALL option. In other words, it matches any
|
|
+ character except one that signifies the end of a line. Perl also uses
|
|
\N to match characters by name; PCRE does not support this.
|
|
|
|
|
|
MATCHING A SINGLE DATA UNIT
|
|
|
|
- Outside a character class, the escape sequence \C matches any one data
|
|
- unit, whether or not a UTF mode is set. In the 8-bit library, one data
|
|
- unit is one byte; in the 16-bit library it is a 16-bit unit; in the
|
|
- 32-bit library it is a 32-bit unit. Unlike a dot, \C always matches
|
|
- line-ending characters. The feature is provided in Perl in order to
|
|
+ Outside a character class, the escape sequence \C matches any one data
|
|
+ unit, whether or not a UTF mode is set. In the 8-bit library, one data
|
|
+ unit is one byte; in the 16-bit library it is a 16-bit unit; in the
|
|
+ 32-bit library it is a 32-bit unit. Unlike a dot, \C always matches
|
|
+ line-ending characters. The feature is provided in Perl in order to
|
|
match individual bytes in UTF-8 mode, but it is unclear how it can use-
|
|
- fully be used. Because \C breaks up characters into individual data
|
|
- units, matching one unit with \C in a UTF mode means that the rest of
|
|
+ fully be used. Because \C breaks up characters into individual data
|
|
+ units, matching one unit with \C in a UTF mode means that the rest of
|
|
the string may start with a malformed UTF character. This has undefined
|
|
results, because PCRE assumes that it is dealing with valid UTF strings
|
|
- (and by default it checks this at the start of processing unless the
|
|
- PCRE_NO_UTF8_CHECK, PCRE_NO_UTF16_CHECK or PCRE_NO_UTF32_CHECK option
|
|
+ (and by default it checks this at the start of processing unless the
|
|
+ PCRE_NO_UTF8_CHECK, PCRE_NO_UTF16_CHECK or PCRE_NO_UTF32_CHECK option
|
|
is used).
|
|
|
|
- PCRE does not allow \C to appear in lookbehind assertions (described
|
|
- below) in a UTF mode, because this would make it impossible to calcu-
|
|
+ PCRE does not allow \C to appear in lookbehind assertions (described
|
|
+ below) in a UTF mode, because this would make it impossible to calcu-
|
|
late the length of the lookbehind.
|
|
|
|
In general, the \C escape sequence is best avoided. However, one way of
|
|
- using it that avoids the problem of malformed UTF characters is to use
|
|
- a lookahead to check the length of the next character, as in this pat-
|
|
- tern, which could be used with a UTF-8 string (ignore white space and
|
|
+ using it that avoids the problem of malformed UTF characters is to use
|
|
+ a lookahead to check the length of the next character, as in this pat-
|
|
+ tern, which could be used with a UTF-8 string (ignore white space and
|
|
line breaks):
|
|
|
|
(?| (?=[\x00-\x7f])(\C) |
|
|
@@ -5713,11 +5737,11 @@ MATCHING A SINGLE DATA UNIT
|
|
(?=[\x{800}-\x{ffff}])(\C)(\C)(\C) |
|
|
(?=[\x{10000}-\x{1fffff}])(\C)(\C)(\C)(\C))
|
|
|
|
- A group that starts with (?| resets the capturing parentheses numbers
|
|
- in each alternative (see "Duplicate Subpattern Numbers" below). The
|
|
- assertions at the start of each branch check the next UTF-8 character
|
|
- for values whose encoding uses 1, 2, 3, or 4 bytes, respectively. The
|
|
- character's individual bytes are then captured by the appropriate num-
|
|
+ A group that starts with (?| resets the capturing parentheses numbers
|
|
+ in each alternative (see "Duplicate Subpattern Numbers" below). The
|
|
+ assertions at the start of each branch check the next UTF-8 character
|
|
+ for values whose encoding uses 1, 2, 3, or 4 bytes, respectively. The
|
|
+ character's individual bytes are then captured by the appropriate num-
|
|
ber of groups.
|
|
|
|
|
|
@@ -5727,109 +5751,109 @@ SQUARE BRACKETS AND CHARACTER CLASSES
|
|
closing square bracket. A closing square bracket on its own is not spe-
|
|
cial by default. However, if the PCRE_JAVASCRIPT_COMPAT option is set,
|
|
a lone closing square bracket causes a compile-time error. If a closing
|
|
- square bracket is required as a member of the class, it should be the
|
|
- first data character in the class (after an initial circumflex, if
|
|
+ square bracket is required as a member of the class, it should be the
|
|
+ first data character in the class (after an initial circumflex, if
|
|
present) or escaped with a backslash.
|
|
|
|
- A character class matches a single character in the subject. In a UTF
|
|
- mode, the character may be more than one data unit long. A matched
|
|
+ A character class matches a single character in the subject. In a UTF
|
|
+ mode, the character may be more than one data unit long. A matched
|
|
character must be in the set of characters defined by the class, unless
|
|
- the first character in the class definition is a circumflex, in which
|
|
+ the first character in the class definition is a circumflex, in which
|
|
case the subject character must not be in the set defined by the class.
|
|
- If a circumflex is actually required as a member of the class, ensure
|
|
+ If a circumflex is actually required as a member of the class, ensure
|
|
it is not the first character, or escape it with a backslash.
|
|
|
|
- For example, the character class [aeiou] matches any lower case vowel,
|
|
- while [^aeiou] matches any character that is not a lower case vowel.
|
|
+ For example, the character class [aeiou] matches any lower case vowel,
|
|
+ while [^aeiou] matches any character that is not a lower case vowel.
|
|
Note that a circumflex is just a convenient notation for specifying the
|
|
- characters that are in the class by enumerating those that are not. A
|
|
- class that starts with a circumflex is not an assertion; it still con-
|
|
- sumes a character from the subject string, and therefore it fails if
|
|
+ characters that are in the class by enumerating those that are not. A
|
|
+ class that starts with a circumflex is not an assertion; it still con-
|
|
+ sumes a character from the subject string, and therefore it fails if
|
|
the current pointer is at the end of the string.
|
|
|
|
In UTF-8 (UTF-16, UTF-32) mode, characters with values greater than 255
|
|
- (0xffff) can be included in a class as a literal string of data units,
|
|
+ (0xffff) can be included in a class as a literal string of data units,
|
|
or by using the \x{ escaping mechanism.
|
|
|
|
- When caseless matching is set, any letters in a class represent both
|
|
- their upper case and lower case versions, so for example, a caseless
|
|
- [aeiou] matches "A" as well as "a", and a caseless [^aeiou] does not
|
|
- match "A", whereas a caseful version would. In a UTF mode, PCRE always
|
|
- understands the concept of case for characters whose values are less
|
|
- than 128, so caseless matching is always possible. For characters with
|
|
- higher values, the concept of case is supported if PCRE is compiled
|
|
- with Unicode property support, but not otherwise. If you want to use
|
|
- caseless matching in a UTF mode for characters 128 and above, you must
|
|
- ensure that PCRE is compiled with Unicode property support as well as
|
|
+ When caseless matching is set, any letters in a class represent both
|
|
+ their upper case and lower case versions, so for example, a caseless
|
|
+ [aeiou] matches "A" as well as "a", and a caseless [^aeiou] does not
|
|
+ match "A", whereas a caseful version would. In a UTF mode, PCRE always
|
|
+ understands the concept of case for characters whose values are less
|
|
+ than 128, so caseless matching is always possible. For characters with
|
|
+ higher values, the concept of case is supported if PCRE is compiled
|
|
+ with Unicode property support, but not otherwise. If you want to use
|
|
+ caseless matching in a UTF mode for characters 128 and above, you must
|
|
+ ensure that PCRE is compiled with Unicode property support as well as
|
|
with UTF support.
|
|
|
|
- Characters that might indicate line breaks are never treated in any
|
|
- special way when matching character classes, whatever line-ending
|
|
- sequence is in use, and whatever setting of the PCRE_DOTALL and
|
|
+ Characters that might indicate line breaks are never treated in any
|
|
+ special way when matching character classes, whatever line-ending
|
|
+ sequence is in use, and whatever setting of the PCRE_DOTALL and
|
|
PCRE_MULTILINE options is used. A class such as [^a] always matches one
|
|
of these characters.
|
|
|
|
- The minus (hyphen) character can be used to specify a range of charac-
|
|
- ters in a character class. For example, [d-m] matches any letter
|
|
- between d and m, inclusive. If a minus character is required in a
|
|
- class, it must be escaped with a backslash or appear in a position
|
|
- where it cannot be interpreted as indicating a range, typically as the
|
|
+ The minus (hyphen) character can be used to specify a range of charac-
|
|
+ ters in a character class. For example, [d-m] matches any letter
|
|
+ between d and m, inclusive. If a minus character is required in a
|
|
+ class, it must be escaped with a backslash or appear in a position
|
|
+ where it cannot be interpreted as indicating a range, typically as the
|
|
first or last character in the class, or immediately after a range. For
|
|
- example, [b-d-z] matches letters in the range b to d, a hyphen charac-
|
|
+ example, [b-d-z] matches letters in the range b to d, a hyphen charac-
|
|
ter, or z.
|
|
|
|
It is not possible to have the literal character "]" as the end charac-
|
|
- ter of a range. A pattern such as [W-]46] is interpreted as a class of
|
|
- two characters ("W" and "-") followed by a literal string "46]", so it
|
|
- would match "W46]" or "-46]". However, if the "]" is escaped with a
|
|
- backslash it is interpreted as the end of range, so [W-\]46] is inter-
|
|
- preted as a class containing a range followed by two other characters.
|
|
- The octal or hexadecimal representation of "]" can also be used to end
|
|
+ ter of a range. A pattern such as [W-]46] is interpreted as a class of
|
|
+ two characters ("W" and "-") followed by a literal string "46]", so it
|
|
+ would match "W46]" or "-46]". However, if the "]" is escaped with a
|
|
+ backslash it is interpreted as the end of range, so [W-\]46] is inter-
|
|
+ preted as a class containing a range followed by two other characters.
|
|
+ The octal or hexadecimal representation of "]" can also be used to end
|
|
a range.
|
|
|
|
- An error is generated if a POSIX character class (see below) or an
|
|
- escape sequence other than one that defines a single character appears
|
|
- at a point where a range ending character is expected. For example,
|
|
+ An error is generated if a POSIX character class (see below) or an
|
|
+ escape sequence other than one that defines a single character appears
|
|
+ at a point where a range ending character is expected. For example,
|
|
[z-\xff] is valid, but [A-\d] and [A-[:digit:]] are not.
|
|
|
|
- Ranges operate in the collating sequence of character values. They can
|
|
- also be used for characters specified numerically, for example
|
|
- [\000-\037]. Ranges can include any characters that are valid for the
|
|
+ Ranges operate in the collating sequence of character values. They can
|
|
+ also be used for characters specified numerically, for example
|
|
+ [\000-\037]. Ranges can include any characters that are valid for the
|
|
current mode.
|
|
|
|
If a range that includes letters is used when caseless matching is set,
|
|
it matches the letters in either case. For example, [W-c] is equivalent
|
|
- to [][\\^_`wxyzabc], matched caselessly, and in a non-UTF mode, if
|
|
- character tables for a French locale are in use, [\xc8-\xcb] matches
|
|
- accented E characters in both cases. In UTF modes, PCRE supports the
|
|
- concept of case for characters with values greater than 128 only when
|
|
+ to [][\\^_`wxyzabc], matched caselessly, and in a non-UTF mode, if
|
|
+ character tables for a French locale are in use, [\xc8-\xcb] matches
|
|
+ accented E characters in both cases. In UTF modes, PCRE supports the
|
|
+ concept of case for characters with values greater than 128 only when
|
|
it is compiled with Unicode property support.
|
|
|
|
- The character escape sequences \d, \D, \h, \H, \p, \P, \s, \S, \v, \V,
|
|
+ The character escape sequences \d, \D, \h, \H, \p, \P, \s, \S, \v, \V,
|
|
\w, and \W may appear in a character class, and add the characters that
|
|
- they match to the class. For example, [\dABCDEF] matches any hexadeci-
|
|
- mal digit. In UTF modes, the PCRE_UCP option affects the meanings of
|
|
- \d, \s, \w and their upper case partners, just as it does when they
|
|
- appear outside a character class, as described in the section entitled
|
|
+ they match to the class. For example, [\dABCDEF] matches any hexadeci-
|
|
+ mal digit. In UTF modes, the PCRE_UCP option affects the meanings of
|
|
+ \d, \s, \w and their upper case partners, just as it does when they
|
|
+ appear outside a character class, as described in the section entitled
|
|
"Generic character types" above. The escape sequence \b has a different
|
|
- meaning inside a character class; it matches the backspace character.
|
|
- The sequences \B, \N, \R, and \X are not special inside a character
|
|
- class. Like any other unrecognized escape sequences, they are treated
|
|
- as the literal characters "B", "N", "R", and "X" by default, but cause
|
|
+ meaning inside a character class; it matches the backspace character.
|
|
+ The sequences \B, \N, \R, and \X are not special inside a character
|
|
+ class. Like any other unrecognized escape sequences, they are treated
|
|
+ as the literal characters "B", "N", "R", and "X" by default, but cause
|
|
an error if the PCRE_EXTRA option is set.
|
|
|
|
- A circumflex can conveniently be used with the upper case character
|
|
- types to specify a more restricted set of characters than the matching
|
|
- lower case type. For example, the class [^\W_] matches any letter or
|
|
+ A circumflex can conveniently be used with the upper case character
|
|
+ types to specify a more restricted set of characters than the matching
|
|
+ lower case type. For example, the class [^\W_] matches any letter or
|
|
digit, but not underscore, whereas [\w] includes underscore. A positive
|
|
character class should be read as "something OR something OR ..." and a
|
|
negative class as "NOT something AND NOT something AND NOT ...".
|
|
|
|
- The only metacharacters that are recognized in character classes are
|
|
- backslash, hyphen (only where it can be interpreted as specifying a
|
|
- range), circumflex (only at the start), opening square bracket (only
|
|
- when it can be interpreted as introducing a POSIX class name, or for a
|
|
- special compatibility feature - see the next two sections), and the
|
|
+ The only metacharacters that are recognized in character classes are
|
|
+ backslash, hyphen (only where it can be interpreted as specifying a
|
|
+ range), circumflex (only at the start), opening square bracket (only
|
|
+ when it can be interpreted as introducing a POSIX class name, or for a
|
|
+ special compatibility feature - see the next two sections), and the
|
|
terminating closing square bracket. However, escaping other non-
|
|
alphanumeric characters does no harm.
|
|
|
|
@@ -5837,7 +5861,7 @@ SQUARE BRACKETS AND CHARACTER CLASSES
|
|
POSIX CHARACTER CLASSES
|
|
|
|
Perl supports the POSIX notation for character classes. This uses names
|
|
- enclosed by [: and :] within the enclosing square brackets. PCRE also
|
|
+ enclosed by [: and :] within the enclosing square brackets. PCRE also
|
|
supports this notation. For example,
|
|
|
|
[01[:alpha:]%]
|
|
@@ -5860,28 +5884,28 @@ POSIX CHARACTER CLASSES
|
|
word "word" characters (same as \w)
|
|
xdigit hexadecimal digits
|
|
|
|
- The default "space" characters are HT (9), LF (10), VT (11), FF (12),
|
|
- CR (13), and space (32). If locale-specific matching is taking place,
|
|
- the list of space characters may be different; there may be fewer or
|
|
+ The default "space" characters are HT (9), LF (10), VT (11), FF (12),
|
|
+ CR (13), and space (32). If locale-specific matching is taking place,
|
|
+ the list of space characters may be different; there may be fewer or
|
|
more of them. "Space" used to be different to \s, which did not include
|
|
VT, for Perl compatibility. However, Perl changed at release 5.18, and
|
|
- PCRE followed at release 8.34. "Space" and \s now match the same set
|
|
+ PCRE followed at release 8.34. "Space" and \s now match the same set
|
|
of characters.
|
|
|
|
- The name "word" is a Perl extension, and "blank" is a GNU extension
|
|
- from Perl 5.8. Another Perl extension is negation, which is indicated
|
|
+ The name "word" is a Perl extension, and "blank" is a GNU extension
|
|
+ from Perl 5.8. Another Perl extension is negation, which is indicated
|
|
by a ^ character after the colon. For example,
|
|
|
|
[12[:^digit:]]
|
|
|
|
- matches "1", "2", or any non-digit. PCRE (and Perl) also recognize the
|
|
+ matches "1", "2", or any non-digit. PCRE (and Perl) also recognize the
|
|
POSIX syntax [.ch.] and [=ch=] where "ch" is a "collating element", but
|
|
these are not supported, and an error is given if they are encountered.
|
|
|
|
By default, characters with values greater than 128 do not match any of
|
|
- the POSIX character classes. However, if the PCRE_UCP option is passed
|
|
- to pcre_compile(), some of the classes are changed so that Unicode
|
|
- character properties are used. This is achieved by replacing certain
|
|
+ the POSIX character classes. However, if the PCRE_UCP option is passed
|
|
+ to pcre_compile(), some of the classes are changed so that Unicode
|
|
+ character properties are used. This is achieved by replacing certain
|
|
POSIX classes by other sequences, as follows:
|
|
|
|
[:alnum:] becomes \p{Xan}
|
|
@@ -5893,10 +5917,10 @@ POSIX CHARACTER CLASSES
|
|
[:upper:] becomes \p{Lu}
|
|
[:word:] becomes \p{Xwd}
|
|
|
|
- Negated versions, such as [:^alpha:] use \P instead of \p. Three other
|
|
+ Negated versions, such as [:^alpha:] use \P instead of \p. Three other
|
|
POSIX classes are handled specially in UCP mode:
|
|
|
|
- [:graph:] This matches characters that have glyphs that mark the page
|
|
+ [:graph:] This matches characters that have glyphs that mark the page
|
|
when printed. In Unicode property terms, it matches all char-
|
|
acters with the L, M, N, P, S, or Cf properties, except for:
|
|
|
|
@@ -5905,58 +5929,58 @@ POSIX CHARACTER CLASSES
|
|
U+2066 - U+2069 Various "isolate"s
|
|
|
|
|
|
- [:print:] This matches the same characters as [:graph:] plus space
|
|
- characters that are not controls, that is, characters with
|
|
+ [:print:] This matches the same characters as [:graph:] plus space
|
|
+ characters that are not controls, that is, characters with
|
|
the Zs property.
|
|
|
|
[:punct:] This matches all characters that have the Unicode P (punctua-
|
|
- tion) property, plus those characters whose code points are
|
|
+ tion) property, plus those characters whose code points are
|
|
less than 128 that have the S (Symbol) property.
|
|
|
|
- The other POSIX classes are unchanged, and match only characters with
|
|
+ The other POSIX classes are unchanged, and match only characters with
|
|
code points less than 128.
|
|
|
|
|
|
COMPATIBILITY FEATURE FOR WORD BOUNDARIES
|
|
|
|
- In the POSIX.2 compliant library that was included in 4.4BSD Unix, the
|
|
- ugly syntax [[:<:]] and [[:>:]] is used for matching "start of word"
|
|
+ In the POSIX.2 compliant library that was included in 4.4BSD Unix, the
|
|
+ ugly syntax [[:<:]] and [[:>:]] is used for matching "start of word"
|
|
and "end of word". PCRE treats these items as follows:
|
|
|
|
[[:<:]] is converted to \b(?=\w)
|
|
[[:>:]] is converted to \b(?<=\w)
|
|
|
|
Only these exact character sequences are recognized. A sequence such as
|
|
- [a[:<:]b] provokes error for an unrecognized POSIX class name. This
|
|
- support is not compatible with Perl. It is provided to help migrations
|
|
+ [a[:<:]b] provokes error for an unrecognized POSIX class name. This
|
|
+ support is not compatible with Perl. It is provided to help migrations
|
|
from other environments, and is best not used in any new patterns. Note
|
|
- that \b matches at the start and the end of a word (see "Simple asser-
|
|
- tions" above), and in a Perl-style pattern the preceding or following
|
|
- character normally shows which is wanted, without the need for the
|
|
- assertions that are used above in order to give exactly the POSIX be-
|
|
+ that \b matches at the start and the end of a word (see "Simple asser-
|
|
+ tions" above), and in a Perl-style pattern the preceding or following
|
|
+ character normally shows which is wanted, without the need for the
|
|
+ assertions that are used above in order to give exactly the POSIX be-
|
|
haviour.
|
|
|
|
|
|
VERTICAL BAR
|
|
|
|
- Vertical bar characters are used to separate alternative patterns. For
|
|
+ Vertical bar characters are used to separate alternative patterns. For
|
|
example, the pattern
|
|
|
|
gilbert|sullivan
|
|
|
|
- matches either "gilbert" or "sullivan". Any number of alternatives may
|
|
- appear, and an empty alternative is permitted (matching the empty
|
|
+ matches either "gilbert" or "sullivan". Any number of alternatives may
|
|
+ appear, and an empty alternative is permitted (matching the empty
|
|
string). The matching process tries each alternative in turn, from left
|
|
- to right, and the first one that succeeds is used. If the alternatives
|
|
- are within a subpattern (defined below), "succeeds" means matching the
|
|
+ to right, and the first one that succeeds is used. If the alternatives
|
|
+ are within a subpattern (defined below), "succeeds" means matching the
|
|
rest of the main pattern as well as the alternative in the subpattern.
|
|
|
|
|
|
INTERNAL OPTION SETTING
|
|
|
|
- The settings of the PCRE_CASELESS, PCRE_MULTILINE, PCRE_DOTALL, and
|
|
- PCRE_EXTENDED options (which are Perl-compatible) can be changed from
|
|
- within the pattern by a sequence of Perl option letters enclosed
|
|
+ The settings of the PCRE_CASELESS, PCRE_MULTILINE, PCRE_DOTALL, and
|
|
+ PCRE_EXTENDED options (which are Perl-compatible) can be changed from
|
|
+ within the pattern by a sequence of Perl option letters enclosed
|
|
between "(?" and ")". The option letters are
|
|
|
|
i for PCRE_CASELESS
|
|
@@ -5966,51 +5990,51 @@ INTERNAL OPTION SETTING
|
|
|
|
For example, (?im) sets caseless, multiline matching. It is also possi-
|
|
ble to unset these options by preceding the letter with a hyphen, and a
|
|
- combined setting and unsetting such as (?im-sx), which sets PCRE_CASE-
|
|
- LESS and PCRE_MULTILINE while unsetting PCRE_DOTALL and PCRE_EXTENDED,
|
|
- is also permitted. If a letter appears both before and after the
|
|
+ combined setting and unsetting such as (?im-sx), which sets PCRE_CASE-
|
|
+ LESS and PCRE_MULTILINE while unsetting PCRE_DOTALL and PCRE_EXTENDED,
|
|
+ is also permitted. If a letter appears both before and after the
|
|
hyphen, the option is unset.
|
|
|
|
- The PCRE-specific options PCRE_DUPNAMES, PCRE_UNGREEDY, and PCRE_EXTRA
|
|
- can be changed in the same way as the Perl-compatible options by using
|
|
+ The PCRE-specific options PCRE_DUPNAMES, PCRE_UNGREEDY, and PCRE_EXTRA
|
|
+ can be changed in the same way as the Perl-compatible options by using
|
|
the characters J, U and X respectively.
|
|
|
|
- When one of these option changes occurs at top level (that is, not
|
|
- inside subpattern parentheses), the change applies to the remainder of
|
|
+ When one of these option changes occurs at top level (that is, not
|
|
+ inside subpattern parentheses), the change applies to the remainder of
|
|
the pattern that follows. If the change is placed right at the start of
|
|
a pattern, PCRE extracts it into the global options (and it will there-
|
|
fore show up in data extracted by the pcre_fullinfo() function).
|
|
|
|
- An option change within a subpattern (see below for a description of
|
|
- subpatterns) affects only that part of the subpattern that follows it,
|
|
+ An option change within a subpattern (see below for a description of
|
|
+ subpatterns) affects only that part of the subpattern that follows it,
|
|
so
|
|
|
|
(a(?i)b)c
|
|
|
|
matches abc and aBc and no other strings (assuming PCRE_CASELESS is not
|
|
- used). By this means, options can be made to have different settings
|
|
- in different parts of the pattern. Any changes made in one alternative
|
|
- do carry on into subsequent branches within the same subpattern. For
|
|
+ used). By this means, options can be made to have different settings
|
|
+ in different parts of the pattern. Any changes made in one alternative
|
|
+ do carry on into subsequent branches within the same subpattern. For
|
|
example,
|
|
|
|
(a(?i)b|c)
|
|
|
|
- matches "ab", "aB", "c", and "C", even though when matching "C" the
|
|
- first branch is abandoned before the option setting. This is because
|
|
- the effects of option settings happen at compile time. There would be
|
|
+ matches "ab", "aB", "c", and "C", even though when matching "C" the
|
|
+ first branch is abandoned before the option setting. This is because
|
|
+ the effects of option settings happen at compile time. There would be
|
|
some very weird behaviour otherwise.
|
|
|
|
- Note: There are other PCRE-specific options that can be set by the
|
|
- application when the compiling or matching functions are called. In
|
|
- some cases the pattern can contain special leading sequences such as
|
|
- (*CRLF) to override what the application has set or what has been
|
|
- defaulted. Details are given in the section entitled "Newline
|
|
- sequences" above. There are also the (*UTF8), (*UTF16),(*UTF32), and
|
|
- (*UCP) leading sequences that can be used to set UTF and Unicode prop-
|
|
- erty modes; they are equivalent to setting the PCRE_UTF8, PCRE_UTF16,
|
|
- PCRE_UTF32 and the PCRE_UCP options, respectively. The (*UTF) sequence
|
|
- is a generic version that can be used with any of the libraries. How-
|
|
- ever, the application can set the PCRE_NEVER_UTF option, which locks
|
|
+ Note: There are other PCRE-specific options that can be set by the
|
|
+ application when the compiling or matching functions are called. In
|
|
+ some cases the pattern can contain special leading sequences such as
|
|
+ (*CRLF) to override what the application has set or what has been
|
|
+ defaulted. Details are given in the section entitled "Newline
|
|
+ sequences" above. There are also the (*UTF8), (*UTF16),(*UTF32), and
|
|
+ (*UCP) leading sequences that can be used to set UTF and Unicode prop-
|
|
+ erty modes; they are equivalent to setting the PCRE_UTF8, PCRE_UTF16,
|
|
+ PCRE_UTF32 and the PCRE_UCP options, respectively. The (*UTF) sequence
|
|
+ is a generic version that can be used with any of the libraries. How-
|
|
+ ever, the application can set the PCRE_NEVER_UTF option, which locks
|
|
out the use of the (*UTF) sequences.
|
|
|
|
|
|
@@ -6023,18 +6047,18 @@ SUBPATTERNS
|
|
|
|
cat(aract|erpillar|)
|
|
|
|
- matches "cataract", "caterpillar", or "cat". Without the parentheses,
|
|
+ matches "cataract", "caterpillar", or "cat". Without the parentheses,
|
|
it would match "cataract", "erpillar" or an empty string.
|
|
|
|
- 2. It sets up the subpattern as a capturing subpattern. This means
|
|
- that, when the whole pattern matches, that portion of the subject
|
|
+ 2. It sets up the subpattern as a capturing subpattern. This means
|
|
+ that, when the whole pattern matches, that portion of the subject
|
|
string that matched the subpattern is passed back to the caller via the
|
|
- ovector argument of the matching function. (This applies only to the
|
|
- traditional matching functions; the DFA matching functions do not sup-
|
|
+ ovector argument of the matching function. (This applies only to the
|
|
+ traditional matching functions; the DFA matching functions do not sup-
|
|
port capturing.)
|
|
|
|
Opening parentheses are counted from left to right (starting from 1) to
|
|
- obtain numbers for the capturing subpatterns. For example, if the
|
|
+ obtain numbers for the capturing subpatterns. For example, if the
|
|
string "the red king" is matched against the pattern
|
|
|
|
the ((red|white) (king|queen))
|
|
@@ -6042,12 +6066,12 @@ SUBPATTERNS
|
|
the captured substrings are "red king", "red", and "king", and are num-
|
|
bered 1, 2, and 3, respectively.
|
|
|
|
- The fact that plain parentheses fulfil two functions is not always
|
|
- helpful. There are often times when a grouping subpattern is required
|
|
- without a capturing requirement. If an opening parenthesis is followed
|
|
- by a question mark and a colon, the subpattern does not do any captur-
|
|
- ing, and is not counted when computing the number of any subsequent
|
|
- capturing subpatterns. For example, if the string "the white queen" is
|
|
+ The fact that plain parentheses fulfil two functions is not always
|
|
+ helpful. There are often times when a grouping subpattern is required
|
|
+ without a capturing requirement. If an opening parenthesis is followed
|
|
+ by a question mark and a colon, the subpattern does not do any captur-
|
|
+ ing, and is not counted when computing the number of any subsequent
|
|
+ capturing subpatterns. For example, if the string "the white queen" is
|
|
matched against the pattern
|
|
|
|
the ((?:red|white) (king|queen))
|
|
@@ -6055,37 +6079,37 @@ SUBPATTERNS
|
|
the captured substrings are "white queen" and "queen", and are numbered
|
|
1 and 2. The maximum number of capturing subpatterns is 65535.
|
|
|
|
- As a convenient shorthand, if any option settings are required at the
|
|
- start of a non-capturing subpattern, the option letters may appear
|
|
+ As a convenient shorthand, if any option settings are required at the
|
|
+ start of a non-capturing subpattern, the option letters may appear
|
|
between the "?" and the ":". Thus the two patterns
|
|
|
|
(?i:saturday|sunday)
|
|
(?:(?i)saturday|sunday)
|
|
|
|
match exactly the same set of strings. Because alternative branches are
|
|
- tried from left to right, and options are not reset until the end of
|
|
- the subpattern is reached, an option setting in one branch does affect
|
|
- subsequent branches, so the above patterns match "SUNDAY" as well as
|
|
+ tried from left to right, and options are not reset until the end of
|
|
+ the subpattern is reached, an option setting in one branch does affect
|
|
+ subsequent branches, so the above patterns match "SUNDAY" as well as
|
|
"Saturday".
|
|
|
|
|
|
DUPLICATE SUBPATTERN NUMBERS
|
|
|
|
Perl 5.10 introduced a feature whereby each alternative in a subpattern
|
|
- uses the same numbers for its capturing parentheses. Such a subpattern
|
|
- starts with (?| and is itself a non-capturing subpattern. For example,
|
|
+ uses the same numbers for its capturing parentheses. Such a subpattern
|
|
+ starts with (?| and is itself a non-capturing subpattern. For example,
|
|
consider this pattern:
|
|
|
|
(?|(Sat)ur|(Sun))day
|
|
|
|
- Because the two alternatives are inside a (?| group, both sets of cap-
|
|
- turing parentheses are numbered one. Thus, when the pattern matches,
|
|
- you can look at captured substring number one, whichever alternative
|
|
- matched. This construct is useful when you want to capture part, but
|
|
+ Because the two alternatives are inside a (?| group, both sets of cap-
|
|
+ turing parentheses are numbered one. Thus, when the pattern matches,
|
|
+ you can look at captured substring number one, whichever alternative
|
|
+ matched. This construct is useful when you want to capture part, but
|
|
not all, of one of a number of alternatives. Inside a (?| group, paren-
|
|
- theses are numbered as usual, but the number is reset at the start of
|
|
- each branch. The numbers of any capturing parentheses that follow the
|
|
- subpattern start after the highest number used in any branch. The fol-
|
|
+ theses are numbered as usual, but the number is reset at the start of
|
|
+ each branch. The numbers of any capturing parentheses that follow the
|
|
+ subpattern start after the highest number used in any branch. The fol-
|
|
lowing example is taken from the Perl documentation. The numbers under-
|
|
neath show in which buffer the captured content will be stored.
|
|
|
|
@@ -6093,58 +6117,58 @@ DUPLICATE SUBPATTERN NUMBERS
|
|
/ ( a ) (?| x ( y ) z | (p (q) r) | (t) u (v) ) ( z ) /x
|
|
# 1 2 2 3 2 3 4
|
|
|
|
- A back reference to a numbered subpattern uses the most recent value
|
|
- that is set for that number by any subpattern. The following pattern
|
|
+ A back reference to a numbered subpattern uses the most recent value
|
|
+ that is set for that number by any subpattern. The following pattern
|
|
matches "abcabc" or "defdef":
|
|
|
|
/(?|(abc)|(def))\1/
|
|
|
|
- In contrast, a subroutine call to a numbered subpattern always refers
|
|
- to the first one in the pattern with the given number. The following
|
|
+ In contrast, a subroutine call to a numbered subpattern always refers
|
|
+ to the first one in the pattern with the given number. The following
|
|
pattern matches "abcabc" or "defabc":
|
|
|
|
/(?|(abc)|(def))(?1)/
|
|
|
|
- If a condition test for a subpattern's having matched refers to a non-
|
|
- unique number, the test is true if any of the subpatterns of that num-
|
|
+ If a condition test for a subpattern's having matched refers to a non-
|
|
+ unique number, the test is true if any of the subpatterns of that num-
|
|
ber have matched.
|
|
|
|
- An alternative approach to using this "branch reset" feature is to use
|
|
+ An alternative approach to using this "branch reset" feature is to use
|
|
duplicate named subpatterns, as described in the next section.
|
|
|
|
|
|
NAMED SUBPATTERNS
|
|
|
|
- Identifying capturing parentheses by number is simple, but it can be
|
|
- very hard to keep track of the numbers in complicated regular expres-
|
|
- sions. Furthermore, if an expression is modified, the numbers may
|
|
- change. To help with this difficulty, PCRE supports the naming of sub-
|
|
+ Identifying capturing parentheses by number is simple, but it can be
|
|
+ very hard to keep track of the numbers in complicated regular expres-
|
|
+ sions. Furthermore, if an expression is modified, the numbers may
|
|
+ change. To help with this difficulty, PCRE supports the naming of sub-
|
|
patterns. This feature was not added to Perl until release 5.10. Python
|
|
- had the feature earlier, and PCRE introduced it at release 4.0, using
|
|
- the Python syntax. PCRE now supports both the Perl and the Python syn-
|
|
- tax. Perl allows identically numbered subpatterns to have different
|
|
+ had the feature earlier, and PCRE introduced it at release 4.0, using
|
|
+ the Python syntax. PCRE now supports both the Perl and the Python syn-
|
|
+ tax. Perl allows identically numbered subpatterns to have different
|
|
names, but PCRE does not.
|
|
|
|
- In PCRE, a subpattern can be named in one of three ways: (?<name>...)
|
|
- or (?'name'...) as in Perl, or (?P<name>...) as in Python. References
|
|
- to capturing parentheses from other parts of the pattern, such as back
|
|
- references, recursion, and conditions, can be made by name as well as
|
|
+ In PCRE, a subpattern can be named in one of three ways: (?<name>...)
|
|
+ or (?'name'...) as in Perl, or (?P<name>...) as in Python. References
|
|
+ to capturing parentheses from other parts of the pattern, such as back
|
|
+ references, recursion, and conditions, can be made by name as well as
|
|
by number.
|
|
|
|
- Names consist of up to 32 alphanumeric characters and underscores, but
|
|
- must start with a non-digit. Named capturing parentheses are still
|
|
- allocated numbers as well as names, exactly as if the names were not
|
|
- present. The PCRE API provides function calls for extracting the name-
|
|
- to-number translation table from a compiled pattern. There is also a
|
|
+ Names consist of up to 32 alphanumeric characters and underscores, but
|
|
+ must start with a non-digit. Named capturing parentheses are still
|
|
+ allocated numbers as well as names, exactly as if the names were not
|
|
+ present. The PCRE API provides function calls for extracting the name-
|
|
+ to-number translation table from a compiled pattern. There is also a
|
|
convenience function for extracting a captured substring by name.
|
|
|
|
- By default, a name must be unique within a pattern, but it is possible
|
|
+ By default, a name must be unique within a pattern, but it is possible
|
|
to relax this constraint by setting the PCRE_DUPNAMES option at compile
|
|
- time. (Duplicate names are also always permitted for subpatterns with
|
|
- the same number, set up as described in the previous section.) Dupli-
|
|
- cate names can be useful for patterns where only one instance of the
|
|
- named parentheses can match. Suppose you want to match the name of a
|
|
- weekday, either as a 3-letter abbreviation or as the full name, and in
|
|
+ time. (Duplicate names are also always permitted for subpatterns with
|
|
+ the same number, set up as described in the previous section.) Dupli-
|
|
+ cate names can be useful for patterns where only one instance of the
|
|
+ named parentheses can match. Suppose you want to match the name of a
|
|
+ weekday, either as a 3-letter abbreviation or as the full name, and in
|
|
both cases you want to extract the abbreviation. This pattern (ignoring
|
|
the line breaks) does the job:
|
|
|
|
@@ -6154,18 +6178,18 @@ NAMED SUBPATTERNS
|
|
(?<DN>Thu)(?:rsday)?|
|
|
(?<DN>Sat)(?:urday)?
|
|
|
|
- There are five capturing substrings, but only one is ever set after a
|
|
+ There are five capturing substrings, but only one is ever set after a
|
|
match. (An alternative way of solving this problem is to use a "branch
|
|
reset" subpattern, as described in the previous section.)
|
|
|
|
- The convenience function for extracting the data by name returns the
|
|
- substring for the first (and in this example, the only) subpattern of
|
|
- that name that matched. This saves searching to find which numbered
|
|
+ The convenience function for extracting the data by name returns the
|
|
+ substring for the first (and in this example, the only) subpattern of
|
|
+ that name that matched. This saves searching to find which numbered
|
|
subpattern it was.
|
|
|
|
- If you make a back reference to a non-unique named subpattern from
|
|
- elsewhere in the pattern, the subpatterns to which the name refers are
|
|
- checked in the order in which they appear in the overall pattern. The
|
|
+ If you make a back reference to a non-unique named subpattern from
|
|
+ elsewhere in the pattern, the subpatterns to which the name refers are
|
|
+ checked in the order in which they appear in the overall pattern. The
|
|
first one that is set is used for the reference. For example, this pat-
|
|
tern matches both "foofoo" and "barbar" but not "foobar" or "barfoo":
|
|
|
|
@@ -6173,29 +6197,29 @@ NAMED SUBPATTERNS
|
|
|
|
|
|
If you make a subroutine call to a non-unique named subpattern, the one
|
|
- that corresponds to the first occurrence of the name is used. In the
|
|
+ that corresponds to the first occurrence of the name is used. In the
|
|
absence of duplicate numbers (see the previous section) this is the one
|
|
with the lowest number.
|
|
|
|
If you use a named reference in a condition test (see the section about
|
|
conditions below), either to check whether a subpattern has matched, or
|
|
- to check for recursion, all subpatterns with the same name are tested.
|
|
- If the condition is true for any one of them, the overall condition is
|
|
- true. This is the same behaviour as testing by number. For further
|
|
- details of the interfaces for handling named subpatterns, see the
|
|
+ to check for recursion, all subpatterns with the same name are tested.
|
|
+ If the condition is true for any one of them, the overall condition is
|
|
+ true. This is the same behaviour as testing by number. For further
|
|
+ details of the interfaces for handling named subpatterns, see the
|
|
pcreapi documentation.
|
|
|
|
Warning: You cannot use different names to distinguish between two sub-
|
|
- patterns with the same number because PCRE uses only the numbers when
|
|
+ patterns with the same number because PCRE uses only the numbers when
|
|
matching. For this reason, an error is given at compile time if differ-
|
|
- ent names are given to subpatterns with the same number. However, you
|
|
+ ent names are given to subpatterns with the same number. However, you
|
|
can always give the same name to subpatterns with the same number, even
|
|
when PCRE_DUPNAMES is not set.
|
|
|
|
|
|
REPETITION
|
|
|
|
- Repetition is specified by quantifiers, which can follow any of the
|
|
+ Repetition is specified by quantifiers, which can follow any of the
|
|
following items:
|
|
|
|
a literal data character
|
|
@@ -6209,17 +6233,17 @@ REPETITION
|
|
a parenthesized subpattern (including assertions)
|
|
a subroutine call to a subpattern (recursive or otherwise)
|
|
|
|
- The general repetition quantifier specifies a minimum and maximum num-
|
|
- ber of permitted matches, by giving the two numbers in curly brackets
|
|
- (braces), separated by a comma. The numbers must be less than 65536,
|
|
+ The general repetition quantifier specifies a minimum and maximum num-
|
|
+ ber of permitted matches, by giving the two numbers in curly brackets
|
|
+ (braces), separated by a comma. The numbers must be less than 65536,
|
|
and the first must be less than or equal to the second. For example:
|
|
|
|
z{2,4}
|
|
|
|
- matches "zz", "zzz", or "zzzz". A closing brace on its own is not a
|
|
- special character. If the second number is omitted, but the comma is
|
|
- present, there is no upper limit; if the second number and the comma
|
|
- are both omitted, the quantifier specifies an exact number of required
|
|
+ matches "zz", "zzz", or "zzzz". A closing brace on its own is not a
|
|
+ special character. If the second number is omitted, but the comma is
|
|
+ present, there is no upper limit; if the second number and the comma
|
|
+ are both omitted, the quantifier specifies an exact number of required
|
|
matches. Thus
|
|
|
|
[aeiou]{3,}
|
|
@@ -6228,50 +6252,50 @@ REPETITION
|
|
|
|
\d{8}
|
|
|
|
- matches exactly 8 digits. An opening curly bracket that appears in a
|
|
- position where a quantifier is not allowed, or one that does not match
|
|
- the syntax of a quantifier, is taken as a literal character. For exam-
|
|
+ matches exactly 8 digits. An opening curly bracket that appears in a
|
|
+ position where a quantifier is not allowed, or one that does not match
|
|
+ the syntax of a quantifier, is taken as a literal character. For exam-
|
|
ple, {,6} is not a quantifier, but a literal string of four characters.
|
|
|
|
In UTF modes, quantifiers apply to characters rather than to individual
|
|
- data units. Thus, for example, \x{100}{2} matches two characters, each
|
|
+ data units. Thus, for example, \x{100}{2} matches two characters, each
|
|
of which is represented by a two-byte sequence in a UTF-8 string. Simi-
|
|
- larly, \X{3} matches three Unicode extended grapheme clusters, each of
|
|
- which may be several data units long (and they may be of different
|
|
+ larly, \X{3} matches three Unicode extended grapheme clusters, each of
|
|
+ which may be several data units long (and they may be of different
|
|
lengths).
|
|
|
|
The quantifier {0} is permitted, causing the expression to behave as if
|
|
the previous item and the quantifier were not present. This may be use-
|
|
- ful for subpatterns that are referenced as subroutines from elsewhere
|
|
+ ful for subpatterns that are referenced as subroutines from elsewhere
|
|
in the pattern (but see also the section entitled "Defining subpatterns
|
|
- for use by reference only" below). Items other than subpatterns that
|
|
+ for use by reference only" below). Items other than subpatterns that
|
|
have a {0} quantifier are omitted from the compiled pattern.
|
|
|
|
- For convenience, the three most common quantifiers have single-charac-
|
|
+ For convenience, the three most common quantifiers have single-charac-
|
|
ter abbreviations:
|
|
|
|
* is equivalent to {0,}
|
|
+ is equivalent to {1,}
|
|
? is equivalent to {0,1}
|
|
|
|
- It is possible to construct infinite loops by following a subpattern
|
|
+ It is possible to construct infinite loops by following a subpattern
|
|
that can match no characters with a quantifier that has no upper limit,
|
|
for example:
|
|
|
|
(a?)*
|
|
|
|
Earlier versions of Perl and PCRE used to give an error at compile time
|
|
- for such patterns. However, because there are cases where this can be
|
|
- useful, such patterns are now accepted, but if any repetition of the
|
|
- subpattern does in fact match no characters, the loop is forcibly bro-
|
|
+ for such patterns. However, because there are cases where this can be
|
|
+ useful, such patterns are now accepted, but if any repetition of the
|
|
+ subpattern does in fact match no characters, the loop is forcibly bro-
|
|
ken.
|
|
|
|
- By default, the quantifiers are "greedy", that is, they match as much
|
|
- as possible (up to the maximum number of permitted times), without
|
|
- causing the rest of the pattern to fail. The classic example of where
|
|
+ By default, the quantifiers are "greedy", that is, they match as much
|
|
+ as possible (up to the maximum number of permitted times), without
|
|
+ causing the rest of the pattern to fail. The classic example of where
|
|
this gives problems is in trying to match comments in C programs. These
|
|
- appear between /* and */ and within the comment, individual * and /
|
|
- characters may appear. An attempt to match C comments by applying the
|
|
+ appear between /* and */ and within the comment, individual * and /
|
|
+ characters may appear. An attempt to match C comments by applying the
|
|
pattern
|
|
|
|
/\*.*\*/
|
|
@@ -6280,19 +6304,19 @@ REPETITION
|
|
|
|
/* first comment */ not comment /* second comment */
|
|
|
|
- fails, because it matches the entire string owing to the greediness of
|
|
+ fails, because it matches the entire string owing to the greediness of
|
|
the .* item.
|
|
|
|
- However, if a quantifier is followed by a question mark, it ceases to
|
|
+ However, if a quantifier is followed by a question mark, it ceases to
|
|
be greedy, and instead matches the minimum number of times possible, so
|
|
the pattern
|
|
|
|
/\*.*?\*/
|
|
|
|
- does the right thing with the C comments. The meaning of the various
|
|
- quantifiers is not otherwise changed, just the preferred number of
|
|
- matches. Do not confuse this use of question mark with its use as a
|
|
- quantifier in its own right. Because it has two uses, it can sometimes
|
|
+ does the right thing with the C comments. The meaning of the various
|
|
+ quantifiers is not otherwise changed, just the preferred number of
|
|
+ matches. Do not confuse this use of question mark with its use as a
|
|
+ quantifier in its own right. Because it has two uses, it can sometimes
|
|
appear doubled, as in
|
|
|
|
\d??\d
|
|
@@ -6300,45 +6324,45 @@ REPETITION
|
|
which matches one digit by preference, but can match two if that is the
|
|
only way the rest of the pattern matches.
|
|
|
|
- If the PCRE_UNGREEDY option is set (an option that is not available in
|
|
- Perl), the quantifiers are not greedy by default, but individual ones
|
|
- can be made greedy by following them with a question mark. In other
|
|
+ If the PCRE_UNGREEDY option is set (an option that is not available in
|
|
+ Perl), the quantifiers are not greedy by default, but individual ones
|
|
+ can be made greedy by following them with a question mark. In other
|
|
words, it inverts the default behaviour.
|
|
|
|
- When a parenthesized subpattern is quantified with a minimum repeat
|
|
- count that is greater than 1 or with a limited maximum, more memory is
|
|
- required for the compiled pattern, in proportion to the size of the
|
|
+ When a parenthesized subpattern is quantified with a minimum repeat
|
|
+ count that is greater than 1 or with a limited maximum, more memory is
|
|
+ required for the compiled pattern, in proportion to the size of the
|
|
minimum or maximum.
|
|
|
|
If a pattern starts with .* or .{0,} and the PCRE_DOTALL option (equiv-
|
|
- alent to Perl's /s) is set, thus allowing the dot to match newlines,
|
|
- the pattern is implicitly anchored, because whatever follows will be
|
|
- tried against every character position in the subject string, so there
|
|
- is no point in retrying the overall match at any position after the
|
|
- first. PCRE normally treats such a pattern as though it were preceded
|
|
+ alent to Perl's /s) is set, thus allowing the dot to match newlines,
|
|
+ the pattern is implicitly anchored, because whatever follows will be
|
|
+ tried against every character position in the subject string, so there
|
|
+ is no point in retrying the overall match at any position after the
|
|
+ first. PCRE normally treats such a pattern as though it were preceded
|
|
by \A.
|
|
|
|
- In cases where it is known that the subject string contains no new-
|
|
- lines, it is worth setting PCRE_DOTALL in order to obtain this opti-
|
|
+ In cases where it is known that the subject string contains no new-
|
|
+ lines, it is worth setting PCRE_DOTALL in order to obtain this opti-
|
|
mization, or alternatively using ^ to indicate anchoring explicitly.
|
|
|
|
- However, there are some cases where the optimization cannot be used.
|
|
+ However, there are some cases where the optimization cannot be used.
|
|
When .* is inside capturing parentheses that are the subject of a back
|
|
reference elsewhere in the pattern, a match at the start may fail where
|
|
a later one succeeds. Consider, for example:
|
|
|
|
(.*)abc\1
|
|
|
|
- If the subject is "xyz123abc123" the match point is the fourth charac-
|
|
+ If the subject is "xyz123abc123" the match point is the fourth charac-
|
|
ter. For this reason, such a pattern is not implicitly anchored.
|
|
|
|
- Another case where implicit anchoring is not applied is when the lead-
|
|
- ing .* is inside an atomic group. Once again, a match at the start may
|
|
+ Another case where implicit anchoring is not applied is when the lead-
|
|
+ ing .* is inside an atomic group. Once again, a match at the start may
|
|
fail where a later one succeeds. Consider this pattern:
|
|
|
|
(?>.*?a)b
|
|
|
|
- It matches "ab" in the subject "aab". The use of the backtracking con-
|
|
+ It matches "ab" in the subject "aab". The use of the backtracking con-
|
|
trol verbs (*PRUNE) and (*SKIP) also disable this optimization.
|
|
|
|
When a capturing subpattern is repeated, the value captured is the sub-
|
|
@@ -6347,8 +6371,8 @@ REPETITION
|
|
(tweedle[dume]{3}\s*)+
|
|
|
|
has matched "tweedledum tweedledee" the value of the captured substring
|
|
- is "tweedledee". However, if there are nested capturing subpatterns,
|
|
- the corresponding captured values may have been set in previous itera-
|
|
+ is "tweedledee". However, if there are nested capturing subpatterns,
|
|
+ the corresponding captured values may have been set in previous itera-
|
|
tions. For example, after
|
|
|
|
/(a|(b))+/
|
|
@@ -6358,53 +6382,53 @@ REPETITION
|
|
|
|
ATOMIC GROUPING AND POSSESSIVE QUANTIFIERS
|
|
|
|
- With both maximizing ("greedy") and minimizing ("ungreedy" or "lazy")
|
|
- repetition, failure of what follows normally causes the repeated item
|
|
- to be re-evaluated to see if a different number of repeats allows the
|
|
- rest of the pattern to match. Sometimes it is useful to prevent this,
|
|
- either to change the nature of the match, or to cause it fail earlier
|
|
- than it otherwise might, when the author of the pattern knows there is
|
|
+ With both maximizing ("greedy") and minimizing ("ungreedy" or "lazy")
|
|
+ repetition, failure of what follows normally causes the repeated item
|
|
+ to be re-evaluated to see if a different number of repeats allows the
|
|
+ rest of the pattern to match. Sometimes it is useful to prevent this,
|
|
+ either to change the nature of the match, or to cause it to fail earlier
|
|
+ than it otherwise might, when the author of the pattern knows there is
|
|
no point in carrying on.
|
|
|
|
- Consider, for example, the pattern \d+foo when applied to the subject
|
|
+ Consider, for example, the pattern \d+foo when applied to the subject
|
|
line
|
|
|
|
123456bar
|
|
|
|
After matching all 6 digits and then failing to match "foo", the normal
|
|
- action of the matcher is to try again with only 5 digits matching the
|
|
- \d+ item, and then with 4, and so on, before ultimately failing.
|
|
- "Atomic grouping" (a term taken from Jeffrey Friedl's book) provides
|
|
- the means for specifying that once a subpattern has matched, it is not
|
|
+ action of the matcher is to try again with only 5 digits matching the
|
|
+ \d+ item, and then with 4, and so on, before ultimately failing.
|
|
+ "Atomic grouping" (a term taken from Jeffrey Friedl's book) provides
|
|
+ the means for specifying that once a subpattern has matched, it is not
|
|
to be re-evaluated in this way.
|
|
|
|
- If we use atomic grouping for the previous example, the matcher gives
|
|
- up immediately on failing to match "foo" the first time. The notation
|
|
+ If we use atomic grouping for the previous example, the matcher gives
|
|
+ up immediately on failing to match "foo" the first time. The notation
|
|
is a kind of special parenthesis, starting with (?> as in this example:
|
|
|
|
(?>\d+)foo
|
|
|
|
- This kind of parenthesis "locks up" the part of the pattern it con-
|
|
- tains once it has matched, and a failure further into the pattern is
|
|
- prevented from backtracking into it. Backtracking past it to previous
|
|
+ This kind of parenthesis "locks up" the part of the pattern it con-
|
|
+ tains once it has matched, and a failure further into the pattern is
|
|
+ prevented from backtracking into it. Backtracking past it to previous
|
|
items, however, works as normal.
|
|
|
|
- An alternative description is that a subpattern of this type matches
|
|
- the string of characters that an identical standalone pattern would
|
|
+ An alternative description is that a subpattern of this type matches
|
|
+ the string of characters that an identical standalone pattern would
|
|
match, if anchored at the current point in the subject string.
|
|
|
|
Atomic grouping subpatterns are not capturing subpatterns. Simple cases
|
|
such as the above example can be thought of as a maximizing repeat that
|
|
- must swallow everything it can. So, while both \d+ and \d+? are pre-
|
|
- pared to adjust the number of digits they match in order to make the
|
|
+ must swallow everything it can. So, while both \d+ and \d+? are pre-
|
|
+ pared to adjust the number of digits they match in order to make the
|
|
rest of the pattern match, (?>\d+) can only match an entire sequence of
|
|
digits.
|
|
|
|
- Atomic groups in general can of course contain arbitrarily complicated
|
|
- subpatterns, and can be nested. However, when the subpattern for an
|
|
+ Atomic groups in general can of course contain arbitrarily complicated
|
|
+ subpatterns, and can be nested. However, when the subpattern for an
|
|
atomic group is just a single repeated item, as in the example above, a
|
|
- simpler notation, called a "possessive quantifier" can be used. This
|
|
- consists of an additional + character following a quantifier. Using
|
|
+ simpler notation, called a "possessive quantifier" can be used. This
|
|
+ consists of an additional + character following a quantifier. Using
|
|
this notation, the previous example can be rewritten as
|
|
|
|
\d++foo
|
|
@@ -6414,45 +6438,45 @@ ATOMIC GROUPING AND POSSESSIVE QUANTIFIERS
|
|
|
|
(abc|xyz){2,3}+
|
|
|
|
- Possessive quantifiers are always greedy; the setting of the
|
|
+ Possessive quantifiers are always greedy; the setting of the
|
|
PCRE_UNGREEDY option is ignored. They are a convenient notation for the
|
|
- simpler forms of atomic group. However, there is no difference in the
|
|
- meaning of a possessive quantifier and the equivalent atomic group,
|
|
- though there may be a performance difference; possessive quantifiers
|
|
+ simpler forms of atomic group. However, there is no difference in the
|
|
+ meaning of a possessive quantifier and the equivalent atomic group,
|
|
+ though there may be a performance difference; possessive quantifiers
|
|
should be slightly faster.
|
|
|
|
- The possessive quantifier syntax is an extension to the Perl 5.8 syn-
|
|
- tax. Jeffrey Friedl originated the idea (and the name) in the first
|
|
+ The possessive quantifier syntax is an extension to the Perl 5.8 syn-
|
|
+ tax. Jeffrey Friedl originated the idea (and the name) in the first
|
|
edition of his book. Mike McCloskey liked it, so implemented it when he
|
|
- built Sun's Java package, and PCRE copied it from there. It ultimately
|
|
+ built Sun's Java package, and PCRE copied it from there. It ultimately
|
|
found its way into Perl at release 5.10.
|
|
|
|
PCRE has an optimization that automatically "possessifies" certain sim-
|
|
- ple pattern constructs. For example, the sequence A+B is treated as
|
|
- A++B because there is no point in backtracking into a sequence of A's
|
|
+ ple pattern constructs. For example, the sequence A+B is treated as
|
|
+ A++B because there is no point in backtracking into a sequence of A's
|
|
when B must follow.
|
|
|
|
- When a pattern contains an unlimited repeat inside a subpattern that
|
|
- can itself be repeated an unlimited number of times, the use of an
|
|
- atomic group is the only way to avoid some failing matches taking a
|
|
+ When a pattern contains an unlimited repeat inside a subpattern that
|
|
+ can itself be repeated an unlimited number of times, the use of an
|
|
+ atomic group is the only way to avoid some failing matches taking a
|
|
very long time indeed. The pattern
|
|
|
|
(\D+|<\d+>)*[!?]
|
|
|
|
- matches an unlimited number of substrings that either consist of non-
|
|
- digits, or digits enclosed in <>, followed by either ! or ?. When it
|
|
+ matches an unlimited number of substrings that either consist of non-
|
|
+ digits, or digits enclosed in <>, followed by either ! or ?. When it
|
|
matches, it runs quickly. However, if it is applied to
|
|
|
|
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
|
|
|
|
- it takes a long time before reporting failure. This is because the
|
|
- string can be divided between the internal \D+ repeat and the external
|
|
- * repeat in a large number of ways, and all have to be tried. (The
|
|
- example uses [!?] rather than a single character at the end, because
|
|
- both PCRE and Perl have an optimization that allows for fast failure
|
|
- when a single character is used. They remember the last single charac-
|
|
- ter that is required for a match, and fail early if it is not present
|
|
- in the string.) If the pattern is changed so that it uses an atomic
|
|
+ it takes a long time before reporting failure. This is because the
|
|
+ string can be divided between the internal \D+ repeat and the external
|
|
+ * repeat in a large number of ways, and all have to be tried. (The
|
|
+ example uses [!?] rather than a single character at the end, because
|
|
+ both PCRE and Perl have an optimization that allows for fast failure
|
|
+ when a single character is used. They remember the last single charac-
|
|
+ ter that is required for a match, and fail early if it is not present
|
|
+ in the string.) If the pattern is changed so that it uses an atomic
|
|
group, like this:
|
|
|
|
((?>\D+)|<\d+>)*[!?]
|
|
@@ -6464,28 +6488,28 @@ BACK REFERENCES
|
|
|
|
Outside a character class, a backslash followed by a digit greater than
|
|
0 (and possibly further digits) is a back reference to a capturing sub-
|
|
- pattern earlier (that is, to its left) in the pattern, provided there
|
|
+ pattern earlier (that is, to its left) in the pattern, provided there
|
|
have been that many previous capturing left parentheses.
|
|
|
|
However, if the decimal number following the backslash is less than 10,
|
|
- it is always taken as a back reference, and causes an error only if
|
|
- there are not that many capturing left parentheses in the entire pat-
|
|
- tern. In other words, the parentheses that are referenced need not be
|
|
- to the left of the reference for numbers less than 10. A "forward back
|
|
- reference" of this type can make sense when a repetition is involved
|
|
- and the subpattern to the right has participated in an earlier itera-
|
|
+ it is always taken as a back reference, and causes an error only if
|
|
+ there are not that many capturing left parentheses in the entire pat-
|
|
+ tern. In other words, the parentheses that are referenced need not be
|
|
+ to the left of the reference for numbers less than 10. A "forward back
|
|
+ reference" of this type can make sense when a repetition is involved
|
|
+ and the subpattern to the right has participated in an earlier itera-
|
|
tion.
|
|
|
|
- It is not possible to have a numerical "forward back reference" to a
|
|
- subpattern whose number is 10 or more using this syntax because a
|
|
- sequence such as \50 is interpreted as a character defined in octal.
|
|
+ It is not possible to have a numerical "forward back reference" to a
|
|
+ subpattern whose number is 10 or more using this syntax because a
|
|
+ sequence such as \50 is interpreted as a character defined in octal.
|
|
See the subsection entitled "Non-printing characters" above for further
|
|
- details of the handling of digits following a backslash. There is no
|
|
- such problem when named parentheses are used. A back reference to any
|
|
+ details of the handling of digits following a backslash. There is no
|
|
+ such problem when named parentheses are used. A back reference to any
|
|
subpattern is possible using named parentheses (see below).
|
|
|
|
- Another way of avoiding the ambiguity inherent in the use of digits
|
|
- following a backslash is to use the \g escape sequence. This escape
|
|
+ Another way of avoiding the ambiguity inherent in the use of digits
|
|
+ following a backslash is to use the \g escape sequence. This escape
|
|
must be followed by an unsigned number or a negative number, optionally
|
|
enclosed in braces. These examples are all identical:
|
|
|
|
@@ -6493,7 +6517,7 @@ BACK REFERENCES
|
|
(ring), \g1
|
|
(ring), \g{1}
|
|
|
|
- An unsigned number specifies an absolute reference without the ambigu-
|
|
+ An unsigned number specifies an absolute reference without the ambigu-
|
|
ity that is present in the older syntax. It is also useful when literal
|
|
digits follow the reference. A negative number is a relative reference.
|
|
Consider this example:
|
|
@@ -6502,33 +6526,33 @@ BACK REFERENCES
|
|
|
|
The sequence \g{-1} is a reference to the most recently started captur-
|
|
ing subpattern before \g, that is, is it equivalent to \2 in this exam-
|
|
- ple. Similarly, \g{-2} would be equivalent to \1. The use of relative
|
|
- references can be helpful in long patterns, and also in patterns that
|
|
- are created by joining together fragments that contain references
|
|
+ ple. Similarly, \g{-2} would be equivalent to \1. The use of relative
|
|
+ references can be helpful in long patterns, and also in patterns that
|
|
+ are created by joining together fragments that contain references
|
|
within themselves.
|
|
|
|
- A back reference matches whatever actually matched the capturing sub-
|
|
- pattern in the current subject string, rather than anything matching
|
|
+ A back reference matches whatever actually matched the capturing sub-
|
|
+ pattern in the current subject string, rather than anything matching
|
|
the subpattern itself (see "Subpatterns as subroutines" below for a way
|
|
of doing that). So the pattern
|
|
|
|
(sens|respons)e and \1ibility
|
|
|
|
- matches "sense and sensibility" and "response and responsibility", but
|
|
- not "sense and responsibility". If caseful matching is in force at the
|
|
- time of the back reference, the case of letters is relevant. For exam-
|
|
+ matches "sense and sensibility" and "response and responsibility", but
|
|
+ not "sense and responsibility". If caseful matching is in force at the
|
|
+ time of the back reference, the case of letters is relevant. For exam-
|
|
ple,
|
|
|
|
((?i)rah)\s+\1
|
|
|
|
- matches "rah rah" and "RAH RAH", but not "RAH rah", even though the
|
|
+ matches "rah rah" and "RAH RAH", but not "RAH rah", even though the
|
|
original capturing subpattern is matched caselessly.
|
|
|
|
- There are several different ways of writing back references to named
|
|
- subpatterns. The .NET syntax \k{name} and the Perl syntax \k<name> or
|
|
- \k'name' are supported, as is the Python syntax (?P=name). Perl 5.10's
|
|
+ There are several different ways of writing back references to named
|
|
+ subpatterns. The .NET syntax \k{name} and the Perl syntax \k<name> or
|
|
+ \k'name' are supported, as is the Python syntax (?P=name). Perl 5.10's
|
|
unified back reference syntax, in which \g can be used for both numeric
|
|
- and named references, is also supported. We could rewrite the above
|
|
+ and named references, is also supported. We could rewrite the above
|
|
example in any of the following ways:
|
|
|
|
(?<p1>(?i)rah)\s+\k<p1>
|
|
@@ -6536,84 +6560,84 @@ BACK REFERENCES
|
|
(?P<p1>(?i)rah)\s+(?P=p1)
|
|
(?<p1>(?i)rah)\s+\g{p1}
|
|
|
|
- A subpattern that is referenced by name may appear in the pattern
|
|
+ A subpattern that is referenced by name may appear in the pattern
|
|
before or after the reference.
|
|
|
|
- There may be more than one back reference to the same subpattern. If a
|
|
- subpattern has not actually been used in a particular match, any back
|
|
+ There may be more than one back reference to the same subpattern. If a
|
|
+ subpattern has not actually been used in a particular match, any back
|
|
references to it always fail by default. For example, the pattern
|
|
|
|
(a|(bc))\2
|
|
|
|
- always fails if it starts to match "a" rather than "bc". However, if
|
|
+ always fails if it starts to match "a" rather than "bc". However, if
|
|
the PCRE_JAVASCRIPT_COMPAT option is set at compile time, a back refer-
|
|
ence to an unset value matches an empty string.
|
|
|
|
- Because there may be many capturing parentheses in a pattern, all dig-
|
|
- its following a backslash are taken as part of a potential back refer-
|
|
- ence number. If the pattern continues with a digit character, some
|
|
- delimiter must be used to terminate the back reference. If the
|
|
- PCRE_EXTENDED option is set, this can be white space. Otherwise, the
|
|
+ Because there may be many capturing parentheses in a pattern, all dig-
|
|
+ its following a backslash are taken as part of a potential back refer-
|
|
+ ence number. If the pattern continues with a digit character, some
|
|
+ delimiter must be used to terminate the back reference. If the
|
|
+ PCRE_EXTENDED option is set, this can be white space. Otherwise, the
|
|
\g{ syntax or an empty comment (see "Comments" below) can be used.
|
|
|
|
Recursive back references
|
|
|
|
- A back reference that occurs inside the parentheses to which it refers
|
|
- fails when the subpattern is first used, so, for example, (a\1) never
|
|
- matches. However, such references can be useful inside repeated sub-
|
|
+ A back reference that occurs inside the parentheses to which it refers
|
|
+ fails when the subpattern is first used, so, for example, (a\1) never
|
|
+ matches. However, such references can be useful inside repeated sub-
|
|
patterns. For example, the pattern
|
|
|
|
(a|b\1)+
|
|
|
|
matches any number of "a"s and also "aba", "ababbaa" etc. At each iter-
|
|
- ation of the subpattern, the back reference matches the character
|
|
- string corresponding to the previous iteration. In order for this to
|
|
- work, the pattern must be such that the first iteration does not need
|
|
- to match the back reference. This can be done using alternation, as in
|
|
+ ation of the subpattern, the back reference matches the character
|
|
+ string corresponding to the previous iteration. In order for this to
|
|
+ work, the pattern must be such that the first iteration does not need
|
|
+ to match the back reference. This can be done using alternation, as in
|
|
the example above, or by a quantifier with a minimum of zero.
|
|
|
|
- Back references of this type cause the group that they reference to be
|
|
- treated as an atomic group. Once the whole group has been matched, a
|
|
- subsequent matching failure cannot cause backtracking into the middle
|
|
+ Back references of this type cause the group that they reference to be
|
|
+ treated as an atomic group. Once the whole group has been matched, a
|
|
+ subsequent matching failure cannot cause backtracking into the middle
|
|
of the group.
|
|
|
|
|
|
ASSERTIONS
|
|
|
|
- An assertion is a test on the characters following or preceding the
|
|
- current matching point that does not actually consume any characters.
|
|
- The simple assertions coded as \b, \B, \A, \G, \Z, \z, ^ and $ are
|
|
+ An assertion is a test on the characters following or preceding the
|
|
+ current matching point that does not actually consume any characters.
|
|
+ The simple assertions coded as \b, \B, \A, \G, \Z, \z, ^ and $ are
|
|
described above.
|
|
|
|
- More complicated assertions are coded as subpatterns. There are two
|
|
- kinds: those that look ahead of the current position in the subject
|
|
- string, and those that look behind it. An assertion subpattern is
|
|
- matched in the normal way, except that it does not cause the current
|
|
+ More complicated assertions are coded as subpatterns. There are two
|
|
+ kinds: those that look ahead of the current position in the subject
|
|
+ string, and those that look behind it. An assertion subpattern is
|
|
+ matched in the normal way, except that it does not cause the current
|
|
matching position to be changed.
|
|
|
|
- Assertion subpatterns are not capturing subpatterns. If such an asser-
|
|
- tion contains capturing subpatterns within it, these are counted for
|
|
- the purposes of numbering the capturing subpatterns in the whole pat-
|
|
- tern. However, substring capturing is carried out only for positive
|
|
+ Assertion subpatterns are not capturing subpatterns. If such an asser-
|
|
+ tion contains capturing subpatterns within it, these are counted for
|
|
+ the purposes of numbering the capturing subpatterns in the whole pat-
|
|
+ tern. However, substring capturing is carried out only for positive
|
|
assertions. (Perl sometimes, but not always, does do capturing in nega-
|
|
tive assertions.)
|
|
|
|
- For compatibility with Perl, assertion subpatterns may be repeated;
|
|
- though it makes no sense to assert the same thing several times, the
|
|
- side effect of capturing parentheses may occasionally be useful. In
|
|
+ For compatibility with Perl, assertion subpatterns may be repeated;
|
|
+ though it makes no sense to assert the same thing several times, the
|
|
+ side effect of capturing parentheses may occasionally be useful. In
|
|
practice, there only three cases:
|
|
|
|
- (1) If the quantifier is {0}, the assertion is never obeyed during
|
|
- matching. However, it may contain internal capturing parenthesized
|
|
+ (1) If the quantifier is {0}, the assertion is never obeyed during
|
|
+ matching. However, it may contain internal capturing parenthesized
|
|
groups that are called from elsewhere via the subroutine mechanism.
|
|
|
|
- (2) If quantifier is {0,n} where n is greater than zero, it is treated
|
|
- as if it were {0,1}. At run time, the rest of the pattern match is
|
|
+ (2) If the quantifier is {0,n} where n is greater than zero, it is treated
|
|
+ as if it were {0,1}. At run time, the rest of the pattern match is
|
|
tried with and without the assertion, the order depending on the greed-
|
|
iness of the quantifier.
|
|
|
|
- (3) If the minimum repetition is greater than zero, the quantifier is
|
|
- ignored. The assertion is obeyed just once when encountered during
|
|
+ (3) If the minimum repetition is greater than zero, the quantifier is
|
|
+ ignored. The assertion is obeyed just once when encountered during
|
|
matching.
|
|
|
|
Lookahead assertions
|
|
@@ -6623,38 +6647,38 @@ ASSERTIONS
|
|
|
|
\w+(?=;)
|
|
|
|
- matches a word followed by a semicolon, but does not include the semi-
|
|
+ matches a word followed by a semicolon, but does not include the semi-
|
|
colon in the match, and
|
|
|
|
foo(?!bar)
|
|
|
|
- matches any occurrence of "foo" that is not followed by "bar". Note
|
|
+ matches any occurrence of "foo" that is not followed by "bar". Note
|
|
that the apparently similar pattern
|
|
|
|
(?!foo)bar
|
|
|
|
- does not find an occurrence of "bar" that is preceded by something
|
|
- other than "foo"; it finds any occurrence of "bar" whatsoever, because
|
|
+ does not find an occurrence of "bar" that is preceded by something
|
|
+ other than "foo"; it finds any occurrence of "bar" whatsoever, because
|
|
the assertion (?!foo) is always true when the next three characters are
|
|
"bar". A lookbehind assertion is needed to achieve the other effect.
|
|
|
|
If you want to force a matching failure at some point in a pattern, the
|
|
- most convenient way to do it is with (?!) because an empty string
|
|
- always matches, so an assertion that requires there not to be an empty
|
|
+ most convenient way to do it is with (?!) because an empty string
|
|
+ always matches, so an assertion that requires there not to be an empty
|
|
string must always fail. The backtracking control verb (*FAIL) or (*F)
|
|
is a synonym for (?!).
|
|
|
|
Lookbehind assertions
|
|
|
|
- Lookbehind assertions start with (?<= for positive assertions and (?<!
|
|
+ Lookbehind assertions start with (?<= for positive assertions and (?<!
|
|
for negative assertions. For example,
|
|
|
|
(?<!foo)bar
|
|
|
|
- does find an occurrence of "bar" that is not preceded by "foo". The
|
|
- contents of a lookbehind assertion are restricted such that all the
|
|
+ does find an occurrence of "bar" that is not preceded by "foo". The
|
|
+ contents of a lookbehind assertion are restricted such that all the
|
|
strings it matches must have a fixed length. However, if there are sev-
|
|
- eral top-level alternatives, they do not all have to have the same
|
|
+ eral top-level alternatives, they do not all have to have the same
|
|
fixed length. Thus
|
|
|
|
(?<=bullock|donkey)
|
|
@@ -6663,62 +6687,62 @@ ASSERTIONS
|
|
|
|
(?<!dogs?|cats?)
|
|
|
|
- causes an error at compile time. Branches that match different length
|
|
- strings are permitted only at the top level of a lookbehind assertion.
|
|
+ causes an error at compile time. Branches that match different length
|
|
+ strings are permitted only at the top level of a lookbehind assertion.
|
|
This is an extension compared with Perl, which requires all branches to
|
|
match the same length of string. An assertion such as
|
|
|
|
(?<=ab(c|de))
|
|
|
|
- is not permitted, because its single top-level branch can match two
|
|
+ is not permitted, because its single top-level branch can match two
|
|
different lengths, but it is acceptable to PCRE if rewritten to use two
|
|
top-level branches:
|
|
|
|
(?<=abc|abde)
|
|
|
|
- In some cases, the escape sequence \K (see above) can be used instead
|
|
+ In some cases, the escape sequence \K (see above) can be used instead
|
|
of a lookbehind assertion to get round the fixed-length restriction.
|
|
|
|
- The implementation of lookbehind assertions is, for each alternative,
|
|
- to temporarily move the current position back by the fixed length and
|
|
+ The implementation of lookbehind assertions is, for each alternative,
|
|
+ to temporarily move the current position back by the fixed length and
|
|
then try to match. If there are insufficient characters before the cur-
|
|
rent position, the assertion fails.
|
|
|
|
- In a UTF mode, PCRE does not allow the \C escape (which matches a sin-
|
|
- gle data unit even in a UTF mode) to appear in lookbehind assertions,
|
|
- because it makes it impossible to calculate the length of the lookbe-
|
|
- hind. The \X and \R escapes, which can match different numbers of data
|
|
+ In a UTF mode, PCRE does not allow the \C escape (which matches a sin-
|
|
+ gle data unit even in a UTF mode) to appear in lookbehind assertions,
|
|
+ because it makes it impossible to calculate the length of the lookbe-
|
|
+ hind. The \X and \R escapes, which can match different numbers of data
|
|
units, are also not permitted.
|
|
|
|
- "Subroutine" calls (see below) such as (?2) or (?&X) are permitted in
|
|
- lookbehinds, as long as the subpattern matches a fixed-length string.
|
|
+ "Subroutine" calls (see below) such as (?2) or (?&X) are permitted in
|
|
+ lookbehinds, as long as the subpattern matches a fixed-length string.
|
|
Recursion, however, is not supported.
|
|
|
|
- Possessive quantifiers can be used in conjunction with lookbehind
|
|
+ Possessive quantifiers can be used in conjunction with lookbehind
|
|
assertions to specify efficient matching of fixed-length strings at the
|
|
end of subject strings. Consider a simple pattern such as
|
|
|
|
abcd$
|
|
|
|
- when applied to a long string that does not match. Because matching
|
|
+ when applied to a long string that does not match. Because matching
|
|
proceeds from left to right, PCRE will look for each "a" in the subject
|
|
- and then see if what follows matches the rest of the pattern. If the
|
|
+ and then see if what follows matches the rest of the pattern. If the
|
|
pattern is specified as
|
|
|
|
^.*abcd$
|
|
|
|
- the initial .* matches the entire string at first, but when this fails
|
|
+ the initial .* matches the entire string at first, but when this fails
|
|
(because there is no following "a"), it backtracks to match all but the
|
|
- last character, then all but the last two characters, and so on. Once
|
|
- again the search for "a" covers the entire string, from right to left,
|
|
+ last character, then all but the last two characters, and so on. Once
|
|
+ again the search for "a" covers the entire string, from right to left,
|
|
so we are no better off. However, if the pattern is written as
|
|
|
|
^.*+(?<=abcd)
|
|
|
|
- there can be no backtracking for the .*+ item; it can match only the
|
|
- entire string. The subsequent lookbehind assertion does a single test
|
|
- on the last four characters. If it fails, the match fails immediately.
|
|
- For long strings, this approach makes a significant difference to the
|
|
+ there can be no backtracking for the .*+ item; it can match only the
|
|
+ entire string. The subsequent lookbehind assertion does a single test
|
|
+ on the last four characters. If it fails, the match fails immediately.
|
|
+ For long strings, this approach makes a significant difference to the
|
|
processing time.
|
|
|
|
Using multiple assertions
|
|
@@ -6727,18 +6751,18 @@ ASSERTIONS
|
|
|
|
(?<=\d{3})(?<!999)foo
|
|
|
|
- matches "foo" preceded by three digits that are not "999". Notice that
|
|
- each of the assertions is applied independently at the same point in
|
|
- the subject string. First there is a check that the previous three
|
|
- characters are all digits, and then there is a check that the same
|
|
+ matches "foo" preceded by three digits that are not "999". Notice that
|
|
+ each of the assertions is applied independently at the same point in
|
|
+ the subject string. First there is a check that the previous three
|
|
+ characters are all digits, and then there is a check that the same
|
|
three characters are not "999". This pattern does not match "foo" pre-
|
|
- ceded by six characters, the first of which are digits and the last
|
|
- three of which are not "999". For example, it doesn't match "123abc-
|
|
+ ceded by six characters, the first of which are digits and the last
|
|
+ three of which are not "999". For example, it doesn't match "123abc-
|
|
foo". A pattern to do that is
|
|
|
|
(?<=\d{3}...)(?<!999)foo
|
|
|
|
- This time the first assertion looks at the preceding six characters,
|
|
+ This time the first assertion looks at the preceding six characters,
|
|
checking that the first three are digits, and then the second assertion
|
|
checks that the preceding three characters are not "999".
|
|
|
|
@@ -6746,29 +6770,29 @@ ASSERTIONS
|
|
|
|
(?<=(?<!foo)bar)baz
|
|
|
|
- matches an occurrence of "baz" that is preceded by "bar" which in turn
|
|
+ matches an occurrence of "baz" that is preceded by "bar" which in turn
|
|
is not preceded by "foo", while
|
|
|
|
(?<=\d{3}(?!999)...)foo
|
|
|
|
- is another pattern that matches "foo" preceded by three digits and any
|
|
+ is another pattern that matches "foo" preceded by three digits and any
|
|
three characters that are not "999".
|
|
|
|
|
|
CONDITIONAL SUBPATTERNS
|
|
|
|
- It is possible to cause the matching process to obey a subpattern con-
|
|
- ditionally or to choose between two alternative subpatterns, depending
|
|
- on the result of an assertion, or whether a specific capturing subpat-
|
|
- tern has already been matched. The two possible forms of conditional
|
|
+ It is possible to cause the matching process to obey a subpattern con-
|
|
+ ditionally or to choose between two alternative subpatterns, depending
|
|
+ on the result of an assertion, or whether a specific capturing subpat-
|
|
+ tern has already been matched. The two possible forms of conditional
|
|
subpattern are:
|
|
|
|
(?(condition)yes-pattern)
|
|
(?(condition)yes-pattern|no-pattern)
|
|
|
|
- If the condition is satisfied, the yes-pattern is used; otherwise the
|
|
- no-pattern (if present) is used. If there are more than two alterna-
|
|
- tives in the subpattern, a compile-time error occurs. Each of the two
|
|
+ If the condition is satisfied, the yes-pattern is used; otherwise the
|
|
+ no-pattern (if present) is used. If there are more than two alterna-
|
|
+ tives in the subpattern, a compile-time error occurs. Each of the two
|
|
alternatives may itself contain nested subpatterns of any form, includ-
|
|
ing conditional subpatterns; the restriction to two alternatives
|
|
applies only at the level of the condition. This pattern fragment is an
|
|
@@ -6777,68 +6801,68 @@ CONDITIONAL SUBPATTERNS
|
|
(?(1) (A|B|C) | (D | (?(2)E|F) | E) )
|
|
|
|
|
|
- There are four kinds of condition: references to subpatterns, refer-
|
|
+ There are four kinds of condition: references to subpatterns, refer-
|
|
ences to recursion, a pseudo-condition called DEFINE, and assertions.
|
|
|
|
Checking for a used subpattern by number
|
|
|
|
- If the text between the parentheses consists of a sequence of digits,
|
|
+ If the text between the parentheses consists of a sequence of digits,
|
|
the condition is true if a capturing subpattern of that number has pre-
|
|
- viously matched. If there is more than one capturing subpattern with
|
|
- the same number (see the earlier section about duplicate subpattern
|
|
- numbers), the condition is true if any of them have matched. An alter-
|
|
- native notation is to precede the digits with a plus or minus sign. In
|
|
- this case, the subpattern number is relative rather than absolute. The
|
|
- most recently opened parentheses can be referenced by (?(-1), the next
|
|
- most recent by (?(-2), and so on. Inside loops it can also make sense
|
|
+ viously matched. If there is more than one capturing subpattern with
|
|
+ the same number (see the earlier section about duplicate subpattern
|
|
+ numbers), the condition is true if any of them have matched. An alter-
|
|
+ native notation is to precede the digits with a plus or minus sign. In
|
|
+ this case, the subpattern number is relative rather than absolute. The
|
|
+ most recently opened parentheses can be referenced by (?(-1), the next
|
|
+ most recent by (?(-2), and so on. Inside loops it can also make sense
|
|
to refer to subsequent groups. The next parentheses to be opened can be
|
|
- referenced as (?(+1), and so on. (The value zero in any of these forms
|
|
+ referenced as (?(+1), and so on. (The value zero in any of these forms
|
|
is not used; it provokes a compile-time error.)
|
|
|
|
- Consider the following pattern, which contains non-significant white
|
|
+ Consider the following pattern, which contains non-significant white
|
|
space to make it more readable (assume the PCRE_EXTENDED option) and to
|
|
divide it into three parts for ease of discussion:
|
|
|
|
( \( )? [^()]+ (?(1) \) )
|
|
|
|
- The first part matches an optional opening parenthesis, and if that
|
|
+ The first part matches an optional opening parenthesis, and if that
|
|
character is present, sets it as the first captured substring. The sec-
|
|
- ond part matches one or more characters that are not parentheses. The
|
|
- third part is a conditional subpattern that tests whether or not the
|
|
- first set of parentheses matched. If they did, that is, if subject
|
|
- started with an opening parenthesis, the condition is true, and so the
|
|
- yes-pattern is executed and a closing parenthesis is required. Other-
|
|
- wise, since no-pattern is not present, the subpattern matches nothing.
|
|
- In other words, this pattern matches a sequence of non-parentheses,
|
|
+ ond part matches one or more characters that are not parentheses. The
|
|
+ third part is a conditional subpattern that tests whether or not the
|
|
+ first set of parentheses matched. If they did, that is, if the subject
|
|
+ started with an opening parenthesis, the condition is true, and so the
|
|
+ yes-pattern is executed and a closing parenthesis is required. Other-
|
|
+ wise, since no-pattern is not present, the subpattern matches nothing.
|
|
+ In other words, this pattern matches a sequence of non-parentheses,
|
|
optionally enclosed in parentheses.
|
|
|
|
- If you were embedding this pattern in a larger one, you could use a
|
|
+ If you were embedding this pattern in a larger one, you could use a
|
|
relative reference:
|
|
|
|
...other stuff... ( \( )? [^()]+ (?(-1) \) ) ...
|
|
|
|
- This makes the fragment independent of the parentheses in the larger
|
|
+ This makes the fragment independent of the parentheses in the larger
|
|
pattern.
|
|
|
|
Checking for a used subpattern by name
|
|
|
|
- Perl uses the syntax (?(<name>)...) or (?('name')...) to test for a
|
|
- used subpattern by name. For compatibility with earlier versions of
|
|
- PCRE, which had this facility before Perl, the syntax (?(name)...) is
|
|
+ Perl uses the syntax (?(<name>)...) or (?('name')...) to test for a
|
|
+ used subpattern by name. For compatibility with earlier versions of
|
|
+ PCRE, which had this facility before Perl, the syntax (?(name)...) is
|
|
also recognized.
|
|
|
|
Rewriting the above example to use a named subpattern gives this:
|
|
|
|
(?<OPEN> \( )? [^()]+ (?(<OPEN>) \) )
|
|
|
|
- If the name used in a condition of this kind is a duplicate, the test
|
|
- is applied to all subpatterns of the same name, and is true if any one
|
|
+ If the name used in a condition of this kind is a duplicate, the test
|
|
+ is applied to all subpatterns of the same name, and is true if any one
|
|
of them has matched.
|
|
|
|
Checking for pattern recursion
|
|
|
|
If the condition is the string (R), and there is no subpattern with the
|
|
- name R, the condition is true if a recursive call to the whole pattern
|
|
+ name R, the condition is true if a recursive call to the whole pattern
|
|
or any subpattern has been made. If digits or a name preceded by amper-
|
|
sand follow the letter R, for example:
|
|
|
|
@@ -6846,51 +6870,51 @@ CONDITIONAL SUBPATTERNS
|
|
|
|
the condition is true if the most recent recursion is into a subpattern
|
|
whose number or name is given. This condition does not check the entire
|
|
- recursion stack. If the name used in a condition of this kind is a
|
|
+ recursion stack. If the name used in a condition of this kind is a
|
|
duplicate, the test is applied to all subpatterns of the same name, and
|
|
is true if any one of them is the most recent recursion.
|
|
|
|
- At "top level", all these recursion test conditions are false. The
|
|
+ At "top level", all these recursion test conditions are false. The
|
|
syntax for recursive patterns is described below.
|
|
|
|
Defining subpatterns for use by reference only
|
|
|
|
- If the condition is the string (DEFINE), and there is no subpattern
|
|
- with the name DEFINE, the condition is always false. In this case,
|
|
- there may be only one alternative in the subpattern. It is always
|
|
- skipped if control reaches this point in the pattern; the idea of
|
|
- DEFINE is that it can be used to define subroutines that can be refer-
|
|
- enced from elsewhere. (The use of subroutines is described below.) For
|
|
- example, a pattern to match an IPv4 address such as "192.168.23.245"
|
|
+ If the condition is the string (DEFINE), and there is no subpattern
|
|
+ with the name DEFINE, the condition is always false. In this case,
|
|
+ there may be only one alternative in the subpattern. It is always
|
|
+ skipped if control reaches this point in the pattern; the idea of
|
|
+ DEFINE is that it can be used to define subroutines that can be refer-
|
|
+ enced from elsewhere. (The use of subroutines is described below.) For
|
|
+ example, a pattern to match an IPv4 address such as "192.168.23.245"
|
|
could be written like this (ignore white space and line breaks):
|
|
|
|
(?(DEFINE) (?<byte> 2[0-4]\d | 25[0-5] | 1\d\d | [1-9]?\d) )
|
|
\b (?&byte) (\.(?&byte)){3} \b
|
|
|
|
- The first part of the pattern is a DEFINE group inside which a another
|
|
- group named "byte" is defined. This matches an individual component of
|
|
- an IPv4 address (a number less than 256). When matching takes place,
|
|
- this part of the pattern is skipped because DEFINE acts like a false
|
|
- condition. The rest of the pattern uses references to the named group
|
|
- to match the four dot-separated components of an IPv4 address, insist-
|
|
+ The first part of the pattern is a DEFINE group inside which another
|
|
+ group named "byte" is defined. This matches an individual component of
|
|
+ an IPv4 address (a number less than 256). When matching takes place,
|
|
+ this part of the pattern is skipped because DEFINE acts like a false
|
|
+ condition. The rest of the pattern uses references to the named group
|
|
+ to match the four dot-separated components of an IPv4 address, insist-
|
|
ing on a word boundary at each end.
|
|
|
|
Assertion conditions
|
|
|
|
- If the condition is not in any of the above formats, it must be an
|
|
- assertion. This may be a positive or negative lookahead or lookbehind
|
|
- assertion. Consider this pattern, again containing non-significant
|
|
+ If the condition is not in any of the above formats, it must be an
|
|
+ assertion. This may be a positive or negative lookahead or lookbehind
|
|
+ assertion. Consider this pattern, again containing non-significant
|
|
white space, and with the two alternatives on the second line:
|
|
|
|
(?(?=[^a-z]*[a-z])
|
|
\d{2}-[a-z]{3}-\d{2} | \d{2}-\d{2}-\d{2} )
|
|
|
|
- The condition is a positive lookahead assertion that matches an
|
|
- optional sequence of non-letters followed by a letter. In other words,
|
|
- it tests for the presence of at least one letter in the subject. If a
|
|
- letter is found, the subject is matched against the first alternative;
|
|
- otherwise it is matched against the second. This pattern matches
|
|
- strings in one of the two forms dd-aaa-dd or dd-dd-dd, where aaa are
|
|
+ The condition is a positive lookahead assertion that matches an
|
|
+ optional sequence of non-letters followed by a letter. In other words,
|
|
+ it tests for the presence of at least one letter in the subject. If a
|
|
+ letter is found, the subject is matched against the first alternative;
|
|
+ otherwise it is matched against the second. This pattern matches
|
|
+ strings in one of the two forms dd-aaa-dd or dd-dd-dd, where aaa are
|
|
letters and dd are digits.
|
|
|
|
|
|
@@ -6899,41 +6923,41 @@ COMMENTS
|
|
There are two ways of including comments in patterns that are processed
|
|
by PCRE. In both cases, the start of the comment must not be in a char-
|
|
acter class, nor in the middle of any other sequence of related charac-
|
|
- ters such as (?: or a subpattern name or number. The characters that
|
|
+ ters such as (?: or a subpattern name or number. The characters that
|
|
make up a comment play no part in the pattern matching.
|
|
|
|
- The sequence (?# marks the start of a comment that continues up to the
|
|
- next closing parenthesis. Nested parentheses are not permitted. If the
|
|
+ The sequence (?# marks the start of a comment that continues up to the
|
|
+ next closing parenthesis. Nested parentheses are not permitted. If the
|
|
PCRE_EXTENDED option is set, an unescaped # character also introduces a
|
|
- comment, which in this case continues to immediately after the next
|
|
- newline character or character sequence in the pattern. Which charac-
|
|
+ comment, which in this case continues to immediately after the next
|
|
+ newline character or character sequence in the pattern. Which charac-
|
|
ters are interpreted as newlines is controlled by the options passed to
|
|
- a compiling function or by a special sequence at the start of the pat-
|
|
+ a compiling function or by a special sequence at the start of the pat-
|
|
tern, as described in the section entitled "Newline conventions" above.
|
|
Note that the end of this type of comment is a literal newline sequence
|
|
- in the pattern; escape sequences that happen to represent a newline do
|
|
- not count. For example, consider this pattern when PCRE_EXTENDED is
|
|
+ in the pattern; escape sequences that happen to represent a newline do
|
|
+ not count. For example, consider this pattern when PCRE_EXTENDED is
|
|
set, and the default newline convention is in force:
|
|
|
|
abc #comment \n still comment
|
|
|
|
- On encountering the # character, pcre_compile() skips along, looking
|
|
- for a newline in the pattern. The sequence \n is still literal at this
|
|
- stage, so it does not terminate the comment. Only an actual character
|
|
+ On encountering the # character, pcre_compile() skips along, looking
|
|
+ for a newline in the pattern. The sequence \n is still literal at this
|
|
+ stage, so it does not terminate the comment. Only an actual character
|
|
with the code value 0x0a (the default newline) does so.
|
|
|
|
|
|
RECURSIVE PATTERNS
|
|
|
|
- Consider the problem of matching a string in parentheses, allowing for
|
|
- unlimited nested parentheses. Without the use of recursion, the best
|
|
- that can be done is to use a pattern that matches up to some fixed
|
|
- depth of nesting. It is not possible to handle an arbitrary nesting
|
|
+ Consider the problem of matching a string in parentheses, allowing for
|
|
+ unlimited nested parentheses. Without the use of recursion, the best
|
|
+ that can be done is to use a pattern that matches up to some fixed
|
|
+ depth of nesting. It is not possible to handle an arbitrary nesting
|
|
depth.
|
|
|
|
For some time, Perl has provided a facility that allows regular expres-
|
|
- sions to recurse (amongst other things). It does this by interpolating
|
|
- Perl code in the expression at run time, and the code can refer to the
|
|
+ sions to recurse (amongst other things). It does this by interpolating
|
|
+ Perl code in the expression at run time, and the code can refer to the
|
|
expression itself. A Perl pattern using code interpolation to solve the
|
|
parentheses problem can be created like this:
|
|
|
|
@@ -6943,201 +6967,201 @@ RECURSIVE PATTERNS
|
|
refers recursively to the pattern in which it appears.
|
|
|
|
Obviously, PCRE cannot support the interpolation of Perl code. Instead,
|
|
- it supports special syntax for recursion of the entire pattern, and
|
|
- also for individual subpattern recursion. After its introduction in
|
|
- PCRE and Python, this kind of recursion was subsequently introduced
|
|
+ it supports special syntax for recursion of the entire pattern, and
|
|
+ also for individual subpattern recursion. After its introduction in
|
|
+ PCRE and Python, this kind of recursion was subsequently introduced
|
|
into Perl at release 5.10.
|
|
|
|
- A special item that consists of (? followed by a number greater than
|
|
- zero and a closing parenthesis is a recursive subroutine call of the
|
|
- subpattern of the given number, provided that it occurs inside that
|
|
- subpattern. (If not, it is a non-recursive subroutine call, which is
|
|
- described in the next section.) The special item (?R) or (?0) is a
|
|
+ A special item that consists of (? followed by a number greater than
|
|
+ zero and a closing parenthesis is a recursive subroutine call of the
|
|
+ subpattern of the given number, provided that it occurs inside that
|
|
+ subpattern. (If not, it is a non-recursive subroutine call, which is
|
|
+ described in the next section.) The special item (?R) or (?0) is a
|
|
recursive call of the entire regular expression.
|
|
|
|
- This PCRE pattern solves the nested parentheses problem (assume the
|
|
+ This PCRE pattern solves the nested parentheses problem (assume the
|
|
PCRE_EXTENDED option is set so that white space is ignored):
|
|
|
|
\( ( [^()]++ | (?R) )* \)
|
|
|
|
- First it matches an opening parenthesis. Then it matches any number of
|
|
- substrings which can either be a sequence of non-parentheses, or a
|
|
- recursive match of the pattern itself (that is, a correctly parenthe-
|
|
+ First it matches an opening parenthesis. Then it matches any number of
|
|
+ substrings which can either be a sequence of non-parentheses, or a
|
|
+ recursive match of the pattern itself (that is, a correctly parenthe-
|
|
sized substring). Finally there is a closing parenthesis. Note the use
|
|
of a possessive quantifier to avoid backtracking into sequences of non-
|
|
parentheses.
|
|
|
|
- If this were part of a larger pattern, you would not want to recurse
|
|
+ If this were part of a larger pattern, you would not want to recurse
|
|
the entire pattern, so instead you could use this:
|
|
|
|
( \( ( [^()]++ | (?1) )* \) )
|
|
|
|
- We have put the pattern into parentheses, and caused the recursion to
|
|
+ We have put the pattern into parentheses, and caused the recursion to
|
|
refer to them instead of the whole pattern.
|
|
|
|
- In a larger pattern, keeping track of parenthesis numbers can be
|
|
- tricky. This is made easier by the use of relative references. Instead
|
|
+ In a larger pattern, keeping track of parenthesis numbers can be
|
|
+ tricky. This is made easier by the use of relative references. Instead
|
|
of (?1) in the pattern above you can write (?-2) to refer to the second
|
|
- most recently opened parentheses preceding the recursion. In other
|
|
- words, a negative number counts capturing parentheses leftwards from
|
|
+ most recently opened parentheses preceding the recursion. In other
|
|
+ words, a negative number counts capturing parentheses leftwards from
|
|
the point at which it is encountered.
|
|
|
|
- It is also possible to refer to subsequently opened parentheses, by
|
|
- writing references such as (?+2). However, these cannot be recursive
|
|
- because the reference is not inside the parentheses that are refer-
|
|
- enced. They are always non-recursive subroutine calls, as described in
|
|
+ It is also possible to refer to subsequently opened parentheses, by
|
|
+ writing references such as (?+2). However, these cannot be recursive
|
|
+ because the reference is not inside the parentheses that are refer-
|
|
+ enced. They are always non-recursive subroutine calls, as described in
|
|
the next section.
|
|
|
|
- An alternative approach is to use named parentheses instead. The Perl
|
|
- syntax for this is (?&name); PCRE's earlier syntax (?P>name) is also
|
|
+ An alternative approach is to use named parentheses instead. The Perl
|
|
+ syntax for this is (?&name); PCRE's earlier syntax (?P>name) is also
|
|
supported. We could rewrite the above example as follows:
|
|
|
|
(?<pn> \( ( [^()]++ | (?&pn) )* \) )
|
|
|
|
- If there is more than one subpattern with the same name, the earliest
|
|
+ If there is more than one subpattern with the same name, the earliest
|
|
one is used.
|
|
|
|
- This particular example pattern that we have been looking at contains
|
|
+ This particular example pattern that we have been looking at contains
|
|
nested unlimited repeats, and so the use of a possessive quantifier for
|
|
matching strings of non-parentheses is important when applying the pat-
|
|
- tern to strings that do not match. For example, when this pattern is
|
|
+ tern to strings that do not match. For example, when this pattern is
|
|
applied to
|
|
|
|
(aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa()
|
|
|
|
- it yields "no match" quickly. However, if a possessive quantifier is
|
|
- not used, the match runs for a very long time indeed because there are
|
|
- so many different ways the + and * repeats can carve up the subject,
|
|
+ it yields "no match" quickly. However, if a possessive quantifier is
|
|
+ not used, the match runs for a very long time indeed because there are
|
|
+ so many different ways the + and * repeats can carve up the subject,
|
|
and all have to be tested before failure can be reported.
|
|
|
|
- At the end of a match, the values of capturing parentheses are those
|
|
- from the outermost level. If you want to obtain intermediate values, a
|
|
- callout function can be used (see below and the pcrecallout documenta-
|
|
+ At the end of a match, the values of capturing parentheses are those
|
|
+ from the outermost level. If you want to obtain intermediate values, a
|
|
+ callout function can be used (see below and the pcrecallout documenta-
|
|
tion). If the pattern above is matched against
|
|
|
|
(ab(cd)ef)
|
|
|
|
- the value for the inner capturing parentheses (numbered 2) is "ef",
|
|
- which is the last value taken on at the top level. If a capturing sub-
|
|
- pattern is not matched at the top level, its final captured value is
|
|
- unset, even if it was (temporarily) set at a deeper level during the
|
|
+ the value for the inner capturing parentheses (numbered 2) is "ef",
|
|
+ which is the last value taken on at the top level. If a capturing sub-
|
|
+ pattern is not matched at the top level, its final captured value is
|
|
+ unset, even if it was (temporarily) set at a deeper level during the
|
|
matching process.
|
|
|
|
- If there are more than 15 capturing parentheses in a pattern, PCRE has
|
|
- to obtain extra memory to store data during a recursion, which it does
|
|
+ If there are more than 15 capturing parentheses in a pattern, PCRE has
|
|
+ to obtain extra memory to store data during a recursion, which it does
|
|
by using pcre_malloc, freeing it via pcre_free afterwards. If no memory
|
|
can be obtained, the match fails with the PCRE_ERROR_NOMEMORY error.
|
|
|
|
- Do not confuse the (?R) item with the condition (R), which tests for
|
|
- recursion. Consider this pattern, which matches text in angle brack-
|
|
- ets, allowing for arbitrary nesting. Only digits are allowed in nested
|
|
- brackets (that is, when recursing), whereas any characters are permit-
|
|
+ Do not confuse the (?R) item with the condition (R), which tests for
|
|
+ recursion. Consider this pattern, which matches text in angle brack-
|
|
+ ets, allowing for arbitrary nesting. Only digits are allowed in nested
|
|
+ brackets (that is, when recursing), whereas any characters are permit-
|
|
ted at the outer level.
|
|
|
|
< (?: (?(R) \d++ | [^<>]*+) | (?R)) * >
|
|
|
|
- In this pattern, (?(R) is the start of a conditional subpattern, with
|
|
- two different alternatives for the recursive and non-recursive cases.
|
|
+ In this pattern, (?(R) is the start of a conditional subpattern, with
|
|
+ two different alternatives for the recursive and non-recursive cases.
|
|
The (?R) item is the actual recursive call.
|
|
|
|
Differences in recursion processing between PCRE and Perl
|
|
|
|
- Recursion processing in PCRE differs from Perl in two important ways.
|
|
- In PCRE (like Python, but unlike Perl), a recursive subpattern call is
|
|
+ Recursion processing in PCRE differs from Perl in two important ways.
|
|
+ In PCRE (like Python, but unlike Perl), a recursive subpattern call is
|
|
always treated as an atomic group. That is, once it has matched some of
|
|
the subject string, it is never re-entered, even if it contains untried
|
|
- alternatives and there is a subsequent matching failure. This can be
|
|
- illustrated by the following pattern, which purports to match a palin-
|
|
- dromic string that contains an odd number of characters (for example,
|
|
+ alternatives and there is a subsequent matching failure. This can be
|
|
+ illustrated by the following pattern, which purports to match a palin-
|
|
+ dromic string that contains an odd number of characters (for example,
|
|
"a", "aba", "abcba", "abcdcba"):
|
|
|
|
^(.|(.)(?1)\2)$
|
|
|
|
The idea is that it either matches a single character, or two identical
|
|
- characters surrounding a sub-palindrome. In Perl, this pattern works;
|
|
- in PCRE it does not if the pattern is longer than three characters.
|
|
+ characters surrounding a sub-palindrome. In Perl, this pattern works;
|
|
+ in PCRE it does not if the subject is longer than three characters.
|
|
Consider the subject string "abcba":
|
|
|
|
- At the top level, the first character is matched, but as it is not at
|
|
+ At the top level, the first character is matched, but as it is not at
|
|
the end of the string, the first alternative fails; the second alterna-
|
|
tive is taken and the recursion kicks in. The recursive call to subpat-
|
|
- tern 1 successfully matches the next character ("b"). (Note that the
|
|
+ tern 1 successfully matches the next character ("b"). (Note that the
|
|
beginning and end of line tests are not part of the recursion).
|
|
|
|
- Back at the top level, the next character ("c") is compared with what
|
|
- subpattern 2 matched, which was "a". This fails. Because the recursion
|
|
- is treated as an atomic group, there are now no backtracking points,
|
|
- and so the entire match fails. (Perl is able, at this point, to re-
|
|
- enter the recursion and try the second alternative.) However, if the
|
|
+ Back at the top level, the next character ("c") is compared with what
|
|
+ subpattern 2 matched, which was "a". This fails. Because the recursion
|
|
+ is treated as an atomic group, there are now no backtracking points,
|
|
+ and so the entire match fails. (Perl is able, at this point, to re-
|
|
+ enter the recursion and try the second alternative.) However, if the
|
|
pattern is written with the alternatives in the other order, things are
|
|
different:
|
|
|
|
^((.)(?1)\2|.)$
|
|
|
|
- This time, the recursing alternative is tried first, and continues to
|
|
- recurse until it runs out of characters, at which point the recursion
|
|
- fails. But this time we do have another alternative to try at the
|
|
- higher level. That is the big difference: in the previous case the
|
|
+ This time, the recursing alternative is tried first, and continues to
|
|
+ recurse until it runs out of characters, at which point the recursion
|
|
+ fails. But this time we do have another alternative to try at the
|
|
+ higher level. That is the big difference: in the previous case the
|
|
remaining alternative is at a deeper recursion level, which PCRE cannot
|
|
use.
|
|
|
|
- To change the pattern so that it matches all palindromic strings, not
|
|
- just those with an odd number of characters, it is tempting to change
|
|
+ To change the pattern so that it matches all palindromic strings, not
|
|
+ just those with an odd number of characters, it is tempting to change
|
|
the pattern to this:
|
|
|
|
^((.)(?1)\2|.?)$
|
|
|
|
- Again, this works in Perl, but not in PCRE, and for the same reason.
|
|
- When a deeper recursion has matched a single character, it cannot be
|
|
- entered again in order to match an empty string. The solution is to
|
|
- separate the two cases, and write out the odd and even cases as alter-
|
|
+ Again, this works in Perl, but not in PCRE, and for the same reason.
|
|
+ When a deeper recursion has matched a single character, it cannot be
|
|
+ entered again in order to match an empty string. The solution is to
|
|
+ separate the two cases, and write out the odd and even cases as alter-
|
|
natives at the higher level:
|
|
|
|
^(?:((.)(?1)\2|)|((.)(?3)\4|.))
|
|
|
|
- If you want to match typical palindromic phrases, the pattern has to
|
|
+ If you want to match typical palindromic phrases, the pattern has to
|
|
ignore all non-word characters, which can be done like this:
|
|
|
|
^\W*+(?:((.)\W*+(?1)\W*+\2|)|((.)\W*+(?3)\W*+\4|\W*+.\W*+))\W*+$
|
|
|
|
If run with the PCRE_CASELESS option, this pattern matches phrases such
|
|
as "A man, a plan, a canal: Panama!" and it works well in both PCRE and
|
|
- Perl. Note the use of the possessive quantifier *+ to avoid backtrack-
|
|
- ing into sequences of non-word characters. Without this, PCRE takes a
|
|
- great deal longer (ten times or more) to match typical phrases, and
|
|
+ Perl. Note the use of the possessive quantifier *+ to avoid backtrack-
|
|
+ ing into sequences of non-word characters. Without this, PCRE takes a
|
|
+ great deal longer (ten times or more) to match typical phrases, and
|
|
Perl takes so long that you think it has gone into a loop.
|
|
|
|
- WARNING: The palindrome-matching patterns above work only if the sub-
|
|
- ject string does not start with a palindrome that is shorter than the
|
|
- entire string. For example, although "abcba" is correctly matched, if
|
|
- the subject is "ababa", PCRE finds the palindrome "aba" at the start,
|
|
- then fails at top level because the end of the string does not follow.
|
|
- Once again, it cannot jump back into the recursion to try other alter-
|
|
+ WARNING: The palindrome-matching patterns above work only if the sub-
|
|
+ ject string does not start with a palindrome that is shorter than the
|
|
+ entire string. For example, although "abcba" is correctly matched, if
|
|
+ the subject is "ababa", PCRE finds the palindrome "aba" at the start,
|
|
+ then fails at top level because the end of the string does not follow.
|
|
+ Once again, it cannot jump back into the recursion to try other alter-
|
|
natives, so the entire match fails.
|
|
|
|
- The second way in which PCRE and Perl differ in their recursion pro-
|
|
- cessing is in the handling of captured values. In Perl, when a subpat-
|
|
- tern is called recursively or as a subpattern (see the next section),
|
|
- it has no access to any values that were captured outside the recur-
|
|
- sion, whereas in PCRE these values can be referenced. Consider this
|
|
+ The second way in which PCRE and Perl differ in their recursion pro-
|
|
+ cessing is in the handling of captured values. In Perl, when a subpat-
|
|
+ tern is called recursively or as a subroutine (see the next section),
|
|
+ it has no access to any values that were captured outside the recur-
|
|
+ sion, whereas in PCRE these values can be referenced. Consider this
|
|
pattern:
|
|
|
|
^(.)(\1|a(?2))
|
|
|
|
- In PCRE, this pattern matches "bab". The first capturing parentheses
|
|
- match "b", then in the second group, when the back reference \1 fails
|
|
- to match "b", the second alternative matches "a" and then recurses. In
|
|
- the recursion, \1 does now match "b" and so the whole match succeeds.
|
|
- In Perl, the pattern fails to match because inside the recursive call
|
|
+ In PCRE, this pattern matches "bab". The first capturing parentheses
|
|
+ match "b", then in the second group, when the back reference \1 fails
|
|
+ to match "b", the second alternative matches "a" and then recurses. In
|
|
+ the recursion, \1 does now match "b" and so the whole match succeeds.
|
|
+ In Perl, the pattern fails to match because inside the recursive call
|
|
\1 cannot access the externally set value.
|
|
|
|
|
|
SUBPATTERNS AS SUBROUTINES
|
|
|
|
- If the syntax for a recursive subpattern call (either by number or by
|
|
- name) is used outside the parentheses to which it refers, it operates
|
|
- like a subroutine in a programming language. The called subpattern may
|
|
- be defined before or after the reference. A numbered reference can be
|
|
+ If the syntax for a recursive subpattern call (either by number or by
|
|
+ name) is used outside the parentheses to which it refers, it operates
|
|
+ like a subroutine in a programming language. The called subpattern may
|
|
+ be defined before or after the reference. A numbered reference can be
|
|
absolute or relative, as in these examples:
|
|
|
|
(...(absolute)...)...(?2)...
|
|
@@ -7148,79 +7172,79 @@ SUBPATTERNS AS SUBROUTINES
|
|
|
|
(sens|respons)e and \1ibility
|
|
|
|
- matches "sense and sensibility" and "response and responsibility", but
|
|
+ matches "sense and sensibility" and "response and responsibility", but
|
|
not "sense and responsibility". If instead the pattern
|
|
|
|
(sens|respons)e and (?1)ibility
|
|
|
|
- is used, it does match "sense and responsibility" as well as the other
|
|
- two strings. Another example is given in the discussion of DEFINE
|
|
+ is used, it does match "sense and responsibility" as well as the other
|
|
+ two strings. Another example is given in the discussion of DEFINE
|
|
above.
|
|
|
|
- All subroutine calls, whether recursive or not, are always treated as
|
|
- atomic groups. That is, once a subroutine has matched some of the sub-
|
|
+ All subroutine calls, whether recursive or not, are always treated as
|
|
+ atomic groups. That is, once a subroutine has matched some of the sub-
|
|
ject string, it is never re-entered, even if it contains untried alter-
|
|
- natives and there is a subsequent matching failure. Any capturing
|
|
- parentheses that are set during the subroutine call revert to their
|
|
+ natives and there is a subsequent matching failure. Any capturing
|
|
+ parentheses that are set during the subroutine call revert to their
|
|
previous values afterwards.
|
|
|
|
- Processing options such as case-independence are fixed when a subpat-
|
|
- tern is defined, so if it is used as a subroutine, such options cannot
|
|
+ Processing options such as case-independence are fixed when a subpat-
|
|
+ tern is defined, so if it is used as a subroutine, such options cannot
|
|
be changed for different calls. For example, consider this pattern:
|
|
|
|
(abc)(?i:(?-1))
|
|
|
|
- It matches "abcabc". It does not match "abcABC" because the change of
|
|
+ It matches "abcabc". It does not match "abcABC" because the change of
|
|
processing option does not affect the called subpattern.
|
|
|
|
|
|
ONIGURUMA SUBROUTINE SYNTAX
|
|
|
|
- For compatibility with Oniguruma, the non-Perl syntax \g followed by a
|
|
+ For compatibility with Oniguruma, the non-Perl syntax \g followed by a
|
|
name or a number enclosed either in angle brackets or single quotes, is
|
|
- an alternative syntax for referencing a subpattern as a subroutine,
|
|
- possibly recursively. Here are two of the examples used above, rewrit-
|
|
+ an alternative syntax for referencing a subpattern as a subroutine,
|
|
+ possibly recursively. Here are two of the examples used above, rewrit-
|
|
ten using this syntax:
|
|
|
|
(?<pn> \( ( (?>[^()]+) | \g<pn> )* \) )
|
|
(sens|respons)e and \g'1'ibility
|
|
|
|
- PCRE supports an extension to Oniguruma: if a number is preceded by a
|
|
+ PCRE supports an extension to Oniguruma: if a number is preceded by a
|
|
plus or a minus sign it is taken as a relative reference. For example:
|
|
|
|
(abc)(?i:\g<-1>)
|
|
|
|
- Note that \g{...} (Perl syntax) and \g<...> (Oniguruma syntax) are not
|
|
- synonymous. The former is a back reference; the latter is a subroutine
|
|
+ Note that \g{...} (Perl syntax) and \g<...> (Oniguruma syntax) are not
|
|
+ synonymous. The former is a back reference; the latter is a subroutine
|
|
call.
|
|
|
|
|
|
CALLOUTS
|
|
|
|
Perl has a feature whereby using the sequence (?{...}) causes arbitrary
|
|
- Perl code to be obeyed in the middle of matching a regular expression.
|
|
+ Perl code to be obeyed in the middle of matching a regular expression.
|
|
This makes it possible, amongst other things, to extract different sub-
|
|
strings that match the same pair of parentheses when there is a repeti-
|
|
tion.
|
|
|
|
PCRE provides a similar feature, but of course it cannot obey arbitrary
|
|
Perl code. The feature is called "callout". The caller of PCRE provides
|
|
- an external function by putting its entry point in the global variable
|
|
- pcre_callout (8-bit library) or pcre[16|32]_callout (16-bit or 32-bit
|
|
- library). By default, this variable contains NULL, which disables all
|
|
+ an external function by putting its entry point in the global variable
|
|
+ pcre_callout (8-bit library) or pcre[16|32]_callout (16-bit or 32-bit
|
|
+ library). By default, this variable contains NULL, which disables all
|
|
calling out.
|
|
|
|
- Within a regular expression, (?C) indicates the points at which the
|
|
- external function is to be called. If you want to identify different
|
|
- callout points, you can put a number less than 256 after the letter C.
|
|
- The default value is zero. For example, this pattern has two callout
|
|
+ Within a regular expression, (?C) indicates the points at which the
|
|
+ external function is to be called. If you want to identify different
|
|
+ callout points, you can put a number less than 256 after the letter C.
|
|
+ The default value is zero. For example, this pattern has two callout
|
|
points:
|
|
|
|
(?C1)abc(?C2)def
|
|
|
|
- If the PCRE_AUTO_CALLOUT flag is passed to a compiling function, call-
|
|
- outs are automatically installed before each item in the pattern. They
|
|
- are all numbered 255. If there is a conditional group in the pattern
|
|
+ If the PCRE_AUTO_CALLOUT flag is passed to a compiling function, call-
|
|
+ outs are automatically installed before each item in the pattern. They
|
|
+ are all numbered 255. If there is a conditional group in the pattern
|
|
whose condition is an assertion, an additional callout is inserted just
|
|
before the condition. An explicit callout may also be set at this posi-
|
|
tion, as in this example:
|
|
@@ -7230,120 +7254,120 @@ CALLOUTS
|
|
Note that this applies only to assertion conditions, not to other types
|
|
of condition.
|
|
|
|
- During matching, when PCRE reaches a callout point, the external func-
|
|
- tion is called. It is provided with the number of the callout, the
|
|
- position in the pattern, and, optionally, one item of data originally
|
|
- supplied by the caller of the matching function. The callout function
|
|
+ During matching, when PCRE reaches a callout point, the external func-
|
|
+ tion is called. It is provided with the number of the callout, the
|
|
+ position in the pattern, and, optionally, one item of data originally
|
|
+ supplied by the caller of the matching function. The callout function
|
|
may cause matching to proceed, to backtrack, or to fail altogether.
|
|
|
|
- By default, PCRE implements a number of optimizations at compile time
|
|
- and matching time, and one side-effect is that sometimes callouts are
|
|
- skipped. If you need all possible callouts to happen, you need to set
|
|
- options that disable the relevant optimizations. More details, and a
|
|
- complete description of the interface to the callout function, are
|
|
+ By default, PCRE implements a number of optimizations at compile time
|
|
+ and matching time, and one side-effect is that sometimes callouts are
|
|
+ skipped. If you need all possible callouts to happen, you need to set
|
|
+ options that disable the relevant optimizations. More details, and a
|
|
+ complete description of the interface to the callout function, are
|
|
given in the pcrecallout documentation.
|
|
|
|
|
|
BACKTRACKING CONTROL
|
|
|
|
- Perl 5.10 introduced a number of "Special Backtracking Control Verbs",
|
|
- which are still described in the Perl documentation as "experimental
|
|
- and subject to change or removal in a future version of Perl". It goes
|
|
- on to say: "Their usage in production code should be noted to avoid
|
|
- problems during upgrades." The same remarks apply to the PCRE features
|
|
+ Perl 5.10 introduced a number of "Special Backtracking Control Verbs",
|
|
+ which are still described in the Perl documentation as "experimental
|
|
+ and subject to change or removal in a future version of Perl". It goes
|
|
+ on to say: "Their usage in production code should be noted to avoid
|
|
+ problems during upgrades." The same remarks apply to the PCRE features
|
|
described in this section.
|
|
|
|
- The new verbs make use of what was previously invalid syntax: an open-
|
|
+ The new verbs make use of what was previously invalid syntax: an open-
|
|
ing parenthesis followed by an asterisk. They are generally of the form
|
|
- (*VERB) or (*VERB:NAME). Some may take either form, possibly behaving
|
|
- differently depending on whether or not a name is present. A name is
|
|
+ (*VERB) or (*VERB:NAME). Some may take either form, possibly behaving
|
|
+ differently depending on whether or not a name is present. A name is
|
|
any sequence of characters that does not include a closing parenthesis.
|
|
The maximum length of name is 255 in the 8-bit library and 65535 in the
|
|
- 16-bit and 32-bit libraries. If the name is empty, that is, if the
|
|
- closing parenthesis immediately follows the colon, the effect is as if
|
|
- the colon were not there. Any number of these verbs may occur in a
|
|
+ 16-bit and 32-bit libraries. If the name is empty, that is, if the
|
|
+ closing parenthesis immediately follows the colon, the effect is as if
|
|
+ the colon were not there. Any number of these verbs may occur in a
|
|
pattern.
|
|
|
|
- Since these verbs are specifically related to backtracking, most of
|
|
- them can be used only when the pattern is to be matched using one of
|
|
- the traditional matching functions, because these use a backtracking
|
|
- algorithm. With the exception of (*FAIL), which behaves like a failing
|
|
- negative assertion, the backtracking control verbs cause an error if
|
|
+ Since these verbs are specifically related to backtracking, most of
|
|
+ them can be used only when the pattern is to be matched using one of
|
|
+ the traditional matching functions, because these use a backtracking
|
|
+ algorithm. With the exception of (*FAIL), which behaves like a failing
|
|
+ negative assertion, the backtracking control verbs cause an error if
|
|
encountered by a DFA matching function.
|
|
|
|
- The behaviour of these verbs in repeated groups, assertions, and in
|
|
+ The behaviour of these verbs in repeated groups, assertions, and in
|
|
subpatterns called as subroutines (whether or not recursively) is docu-
|
|
mented below.
|
|
|
|
Optimizations that affect backtracking verbs
|
|
|
|
- PCRE contains some optimizations that are used to speed up matching by
|
|
+ PCRE contains some optimizations that are used to speed up matching by
|
|
running some checks at the start of each match attempt. For example, it
|
|
- may know the minimum length of matching subject, or that a particular
|
|
+ may know the minimum length of matching subject, or that a particular
|
|
character must be present. When one of these optimizations bypasses the
|
|
- running of a match, any included backtracking verbs will not, of
|
|
+ running of a match, any included backtracking verbs will not, of
|
|
course, be processed. You can suppress the start-of-match optimizations
|
|
- by setting the PCRE_NO_START_OPTIMIZE option when calling pcre_com-
|
|
+ by setting the PCRE_NO_START_OPTIMIZE option when calling pcre_com-
|
|
pile() or pcre_exec(), or by starting the pattern with (*NO_START_OPT).
|
|
There is more discussion of this option in the section entitled "Option
|
|
bits for pcre_exec()" in the pcreapi documentation.
|
|
|
|
- Experiments with Perl suggest that it too has similar optimizations,
|
|
+ Experiments with Perl suggest that it too has similar optimizations,
|
|
sometimes leading to anomalous results.
|
|
|
|
Verbs that act immediately
|
|
|
|
- The following verbs act as soon as they are encountered. They may not
|
|
+ The following verbs act as soon as they are encountered. They may not
|
|
be followed by a name.
|
|
|
|
(*ACCEPT)
|
|
|
|
- This verb causes the match to end successfully, skipping the remainder
|
|
- of the pattern. However, when it is inside a subpattern that is called
|
|
- as a subroutine, only that subpattern is ended successfully. Matching
|
|
+ This verb causes the match to end successfully, skipping the remainder
|
|
+ of the pattern. However, when it is inside a subpattern that is called
|
|
+ as a subroutine, only that subpattern is ended successfully. Matching
|
|
then continues at the outer level. If (*ACCEPT) in triggered in a posi-
|
|
- tive assertion, the assertion succeeds; in a negative assertion, the
|
|
+ tive assertion, the assertion succeeds; in a negative assertion, the
|
|
assertion fails.
|
|
|
|
- If (*ACCEPT) is inside capturing parentheses, the data so far is cap-
|
|
+ If (*ACCEPT) is inside capturing parentheses, the data so far is cap-
|
|
tured. For example:
|
|
|
|
A((?:A|B(*ACCEPT)|C)D)
|
|
|
|
- This matches "AB", "AAD", or "ACD"; when it matches "AB", "B" is cap-
|
|
+ This matches "AB", "AAD", or "ACD"; when it matches "AB", "B" is cap-
|
|
tured by the outer parentheses.
|
|
|
|
(*FAIL) or (*F)
|
|
|
|
- This verb causes a matching failure, forcing backtracking to occur. It
|
|
- is equivalent to (?!) but easier to read. The Perl documentation notes
|
|
- that it is probably useful only when combined with (?{}) or (??{}).
|
|
- Those are, of course, Perl features that are not present in PCRE. The
|
|
- nearest equivalent is the callout feature, as for example in this pat-
|
|
+ This verb causes a matching failure, forcing backtracking to occur. It
|
|
+ is equivalent to (?!) but easier to read. The Perl documentation notes
|
|
+ that it is probably useful only when combined with (?{}) or (??{}).
|
|
+ Those are, of course, Perl features that are not present in PCRE. The
|
|
+ nearest equivalent is the callout feature, as for example in this pat-
|
|
tern:
|
|
|
|
a+(?C)(*FAIL)
|
|
|
|
- A match with the string "aaaa" always fails, but the callout is taken
|
|
+ A match with the string "aaaa" always fails, but the callout is taken
|
|
before each backtrack happens (in this example, 10 times).
|
|
|
|
Recording which path was taken
|
|
|
|
- There is one verb whose main purpose is to track how a match was
|
|
- arrived at, though it also has a secondary use in conjunction with
|
|
+ There is one verb whose main purpose is to track how a match was
|
|
+ arrived at, though it also has a secondary use in conjunction with
|
|
advancing the match starting point (see (*SKIP) below).
|
|
|
|
(*MARK:NAME) or (*:NAME)
|
|
|
|
- A name is always required with this verb. There may be as many
|
|
- instances of (*MARK) as you like in a pattern, and their names do not
|
|
+ A name is always required with this verb. There may be as many
|
|
+ instances of (*MARK) as you like in a pattern, and their names do not
|
|
have to be unique.
|
|
|
|
- When a match succeeds, the name of the last-encountered (*MARK:NAME),
|
|
- (*PRUNE:NAME), or (*THEN:NAME) on the matching path is passed back to
|
|
- the caller as described in the section entitled "Extra data for
|
|
- pcre_exec()" in the pcreapi documentation. Here is an example of
|
|
- pcretest output, where the /K modifier requests the retrieval and out-
|
|
+ When a match succeeds, the name of the last-encountered (*MARK:NAME),
|
|
+ (*PRUNE:NAME), or (*THEN:NAME) on the matching path is passed back to
|
|
+ the caller as described in the section entitled "Extra data for
|
|
+ pcre_exec()" in the pcreapi documentation. Here is an example of
|
|
+ pcretest output, where the /K modifier requests the retrieval and out-
|
|
putting of (*MARK) data:
|
|
|
|
re> /X(*MARK:A)Y|X(*MARK:B)Z/K
|
|
@@ -7355,73 +7379,73 @@ BACKTRACKING CONTROL
|
|
MK: B
|
|
|
|
The (*MARK) name is tagged with "MK:" in this output, and in this exam-
|
|
- ple it indicates which of the two alternatives matched. This is a more
|
|
- efficient way of obtaining this information than putting each alterna-
|
|
+ ple it indicates which of the two alternatives matched. This is a more
|
|
+ efficient way of obtaining this information than putting each alterna-
|
|
tive in its own capturing parentheses.
|
|
|
|
- If a verb with a name is encountered in a positive assertion that is
|
|
- true, the name is recorded and passed back if it is the last-encoun-
|
|
+ If a verb with a name is encountered in a positive assertion that is
|
|
+ true, the name is recorded and passed back if it is the last-encoun-
|
|
tered. This does not happen for negative assertions or failing positive
|
|
assertions.
|
|
|
|
- After a partial match or a failed match, the last encountered name in
|
|
+ After a partial match or a failed match, the last encountered name in
|
|
the entire match process is returned. For example:
|
|
|
|
re> /X(*MARK:A)Y|X(*MARK:B)Z/K
|
|
data> XP
|
|
No match, mark = B
|
|
|
|
- Note that in this unanchored example the mark is retained from the
|
|
+ Note that in this unanchored example the mark is retained from the
|
|
match attempt that started at the letter "X" in the subject. Subsequent
|
|
match attempts starting at "P" and then with an empty string do not get
|
|
as far as the (*MARK) item, but nevertheless do not reset it.
|
|
|
|
- If you are interested in (*MARK) values after failed matches, you
|
|
- should probably set the PCRE_NO_START_OPTIMIZE option (see above) to
|
|
+ If you are interested in (*MARK) values after failed matches, you
|
|
+ should probably set the PCRE_NO_START_OPTIMIZE option (see above) to
|
|
ensure that the match is always attempted.
|
|
|
|
Verbs that act after backtracking
|
|
|
|
The following verbs do nothing when they are encountered. Matching con-
|
|
- tinues with what follows, but if there is no subsequent match, causing
|
|
- a backtrack to the verb, a failure is forced. That is, backtracking
|
|
- cannot pass to the left of the verb. However, when one of these verbs
|
|
+ tinues with what follows, but if there is no subsequent match, causing
|
|
+ a backtrack to the verb, a failure is forced. That is, backtracking
|
|
+ cannot pass to the left of the verb. However, when one of these verbs
|
|
appears inside an atomic group or an assertion that is true, its effect
|
|
- is confined to that group, because once the group has been matched,
|
|
- there is never any backtracking into it. In this situation, backtrack-
|
|
- ing can "jump back" to the left of the entire atomic group or asser-
|
|
- tion. (Remember also, as stated above, that this localization also
|
|
+ is confined to that group, because once the group has been matched,
|
|
+ there is never any backtracking into it. In this situation, backtrack-
|
|
+ ing can "jump back" to the left of the entire atomic group or asser-
|
|
+ tion. (Remember also, as stated above, that this localization also
|
|
applies in subroutine calls.)
|
|
|
|
- These verbs differ in exactly what kind of failure occurs when back-
|
|
- tracking reaches them. The behaviour described below is what happens
|
|
- when the verb is not in a subroutine or an assertion. Subsequent sec-
|
|
+ These verbs differ in exactly what kind of failure occurs when back-
|
|
+ tracking reaches them. The behaviour described below is what happens
|
|
+ when the verb is not in a subroutine or an assertion. Subsequent sec-
|
|
tions cover these special cases.
|
|
|
|
(*COMMIT)
|
|
|
|
- This verb, which may not be followed by a name, causes the whole match
|
|
+ This verb, which may not be followed by a name, causes the whole match
|
|
to fail outright if there is a later matching failure that causes back-
|
|
- tracking to reach it. Even if the pattern is unanchored, no further
|
|
+ tracking to reach it. Even if the pattern is unanchored, no further
|
|
attempts to find a match by advancing the starting point take place. If
|
|
- (*COMMIT) is the only backtracking verb that is encountered, once it
|
|
+ (*COMMIT) is the only backtracking verb that is encountered, once it
|
|
has been passed pcre_exec() is committed to finding a match at the cur-
|
|
rent starting point, or not at all. For example:
|
|
|
|
a+(*COMMIT)b
|
|
|
|
- This matches "xxaab" but not "aacaab". It can be thought of as a kind
|
|
+ This matches "xxaab" but not "aacaab". It can be thought of as a kind
|
|
of dynamic anchor, or "I've started, so I must finish." The name of the
|
|
- most recently passed (*MARK) in the path is passed back when (*COMMIT)
|
|
+ most recently passed (*MARK) in the path is passed back when (*COMMIT)
|
|
forces a match failure.
|
|
|
|
- If there is more than one backtracking verb in a pattern, a different
|
|
- one that follows (*COMMIT) may be triggered first, so merely passing
|
|
+ If there is more than one backtracking verb in a pattern, a different
|
|
+ one that follows (*COMMIT) may be triggered first, so merely passing
|
|
(*COMMIT) during a match does not always guarantee that a match must be
|
|
at this starting point.
|
|
|
|
- Note that (*COMMIT) at the start of a pattern is not the same as an
|
|
- anchor, unless PCRE's start-of-match optimizations are turned off, as
|
|
+ Note that (*COMMIT) at the start of a pattern is not the same as an
|
|
+ anchor, unless PCRE's start-of-match optimizations are turned off, as
|
|
shown in this output from pcretest:
|
|
|
|
re> /(*COMMIT)abc/
|
|
@@ -7432,207 +7456,207 @@ BACKTRACKING CONTROL
|
|
|
|
For this pattern, PCRE knows that any match must start with "a", so the
|
|
optimization skips along the subject to "a" before applying the pattern
|
|
- to the first set of data. The match attempt then succeeds. In the sec-
|
|
- ond set of data, the escape sequence \Y is interpreted by the pcretest
|
|
- program. It causes the PCRE_NO_START_OPTIMIZE option to be set when
|
|
+ to the first set of data. The match attempt then succeeds. In the sec-
|
|
+ ond set of data, the escape sequence \Y is interpreted by the pcretest
|
|
+ program. It causes the PCRE_NO_START_OPTIMIZE option to be set when
|
|
pcre_exec() is called. This disables the optimization that skips along
|
|
to the first character. The pattern is now applied starting at "x", and
|
|
- so the (*COMMIT) causes the match to fail without trying any other
|
|
+ so the (*COMMIT) causes the match to fail without trying any other
|
|
starting points.
|
|
|
|
(*PRUNE) or (*PRUNE:NAME)
|
|
|
|
- This verb causes the match to fail at the current starting position in
|
|
+ This verb causes the match to fail at the current starting position in
|
|
the subject if there is a later matching failure that causes backtrack-
|
|
- ing to reach it. If the pattern is unanchored, the normal "bumpalong"
|
|
- advance to the next starting character then happens. Backtracking can
|
|
- occur as usual to the left of (*PRUNE), before it is reached, or when
|
|
- matching to the right of (*PRUNE), but if there is no match to the
|
|
- right, backtracking cannot cross (*PRUNE). In simple cases, the use of
|
|
- (*PRUNE) is just an alternative to an atomic group or possessive quan-
|
|
+ ing to reach it. If the pattern is unanchored, the normal "bumpalong"
|
|
+ advance to the next starting character then happens. Backtracking can
|
|
+ occur as usual to the left of (*PRUNE), before it is reached, or when
|
|
+ matching to the right of (*PRUNE), but if there is no match to the
|
|
+ right, backtracking cannot cross (*PRUNE). In simple cases, the use of
|
|
+ (*PRUNE) is just an alternative to an atomic group or possessive quan-
|
|
tifier, but there are some uses of (*PRUNE) that cannot be expressed in
|
|
- any other way. In an anchored pattern (*PRUNE) has the same effect as
|
|
+ any other way. In an anchored pattern (*PRUNE) has the same effect as
|
|
(*COMMIT).
|
|
|
|
The behaviour of (*PRUNE:NAME) is the not the same as
|
|
- (*MARK:NAME)(*PRUNE). It is like (*MARK:NAME) in that the name is
|
|
- remembered for passing back to the caller. However, (*SKIP:NAME)
|
|
+ (*MARK:NAME)(*PRUNE). It is like (*MARK:NAME) in that the name is
|
|
+ remembered for passing back to the caller. However, (*SKIP:NAME)
|
|
searches only for names set with (*MARK).
|
|
|
|
(*SKIP)
|
|
|
|
- This verb, when given without a name, is like (*PRUNE), except that if
|
|
- the pattern is unanchored, the "bumpalong" advance is not to the next
|
|
+ This verb, when given without a name, is like (*PRUNE), except that if
|
|
+ the pattern is unanchored, the "bumpalong" advance is not to the next
|
|
character, but to the position in the subject where (*SKIP) was encoun-
|
|
- tered. (*SKIP) signifies that whatever text was matched leading up to
|
|
+ tered. (*SKIP) signifies that whatever text was matched leading up to
|
|
it cannot be part of a successful match. Consider:
|
|
|
|
a+(*SKIP)b
|
|
|
|
- If the subject is "aaaac...", after the first match attempt fails
|
|
- (starting at the first character in the string), the starting point
|
|
+ If the subject is "aaaac...", after the first match attempt fails
|
|
+ (starting at the first character in the string), the starting point
|
|
skips on to start the next attempt at "c". Note that a possessive quan-
|
|
- tifer does not have the same effect as this example; although it would
|
|
- suppress backtracking during the first match attempt, the second
|
|
- attempt would start at the second character instead of skipping on to
|
|
+ tifier does not have the same effect as this example; although it would
|
|
+ suppress backtracking during the first match attempt, the second
|
|
+ attempt would start at the second character instead of skipping on to
|
|
"c".
|
|
|
|
(*SKIP:NAME)
|
|
|
|
When (*SKIP) has an associated name, its behaviour is modified. When it
|
|
is triggered, the previous path through the pattern is searched for the
|
|
- most recent (*MARK) that has the same name. If one is found, the
|
|
+ most recent (*MARK) that has the same name. If one is found, the
|
|
"bumpalong" advance is to the subject position that corresponds to that
|
|
(*MARK) instead of to where (*SKIP) was encountered. If no (*MARK) with
|
|
a matching name is found, the (*SKIP) is ignored.
|
|
|
|
- Note that (*SKIP:NAME) searches only for names set by (*MARK:NAME). It
|
|
+ Note that (*SKIP:NAME) searches only for names set by (*MARK:NAME). It
|
|
ignores names that are set by (*PRUNE:NAME) or (*THEN:NAME).
|
|
|
|
(*THEN) or (*THEN:NAME)
|
|
|
|
- This verb causes a skip to the next innermost alternative when back-
|
|
- tracking reaches it. That is, it cancels any further backtracking
|
|
- within the current alternative. Its name comes from the observation
|
|
+ This verb causes a skip to the next innermost alternative when back-
|
|
+ tracking reaches it. That is, it cancels any further backtracking
|
|
+ within the current alternative. Its name comes from the observation
|
|
that it can be used for a pattern-based if-then-else block:
|
|
|
|
( COND1 (*THEN) FOO | COND2 (*THEN) BAR | COND3 (*THEN) BAZ ) ...
|
|
|
|
- If the COND1 pattern matches, FOO is tried (and possibly further items
|
|
- after the end of the group if FOO succeeds); on failure, the matcher
|
|
- skips to the second alternative and tries COND2, without backtracking
|
|
- into COND1. If that succeeds and BAR fails, COND3 is tried. If subse-
|
|
- quently BAZ fails, there are no more alternatives, so there is a back-
|
|
- track to whatever came before the entire group. If (*THEN) is not
|
|
+ If the COND1 pattern matches, FOO is tried (and possibly further items
|
|
+ after the end of the group if FOO succeeds); on failure, the matcher
|
|
+ skips to the second alternative and tries COND2, without backtracking
|
|
+ into COND1. If that succeeds and BAR fails, COND3 is tried. If subse-
|
|
+ quently BAZ fails, there are no more alternatives, so there is a back-
|
|
+ track to whatever came before the entire group. If (*THEN) is not
|
|
inside an alternation, it acts like (*PRUNE).
|
|
|
|
- The behaviour of (*THEN:NAME) is the not the same as
|
|
- (*MARK:NAME)(*THEN). It is like (*MARK:NAME) in that the name is
|
|
- remembered for passing back to the caller. However, (*SKIP:NAME)
|
|
+ The behaviour of (*THEN:NAME) is not the same as
|
|
+ (*MARK:NAME)(*THEN). It is like (*MARK:NAME) in that the name is
|
|
+ remembered for passing back to the caller. However, (*SKIP:NAME)
|
|
searches only for names set with (*MARK).
|
|
|
|
- A subpattern that does not contain a | character is just a part of the
|
|
- enclosing alternative; it is not a nested alternation with only one
|
|
- alternative. The effect of (*THEN) extends beyond such a subpattern to
|
|
- the enclosing alternative. Consider this pattern, where A, B, etc. are
|
|
- complex pattern fragments that do not contain any | characters at this
|
|
+ A subpattern that does not contain a | character is just a part of the
|
|
+ enclosing alternative; it is not a nested alternation with only one
|
|
+ alternative. The effect of (*THEN) extends beyond such a subpattern to
|
|
+ the enclosing alternative. Consider this pattern, where A, B, etc. are
|
|
+ complex pattern fragments that do not contain any | characters at this
|
|
level:
|
|
|
|
A (B(*THEN)C) | D
|
|
|
|
- If A and B are matched, but there is a failure in C, matching does not
|
|
+ If A and B are matched, but there is a failure in C, matching does not
|
|
backtrack into A; instead it moves to the next alternative, that is, D.
|
|
- However, if the subpattern containing (*THEN) is given an alternative,
|
|
+ However, if the subpattern containing (*THEN) is given an alternative,
|
|
it behaves differently:
|
|
|
|
A (B(*THEN)C | (*FAIL)) | D
|
|
|
|
- The effect of (*THEN) is now confined to the inner subpattern. After a
|
|
+ The effect of (*THEN) is now confined to the inner subpattern. After a
|
|
failure in C, matching moves to (*FAIL), which causes the whole subpat-
|
|
- tern to fail because there are no more alternatives to try. In this
|
|
+ tern to fail because there are no more alternatives to try. In this
|
|
case, matching does now backtrack into A.
|
|
|
|
- Note that a conditional subpattern is not considered as having two
|
|
- alternatives, because only one is ever used. In other words, the |
|
|
+ Note that a conditional subpattern is not considered as having two
|
|
+ alternatives, because only one is ever used. In other words, the |
|
|
character in a conditional subpattern has a different meaning. Ignoring
|
|
white space, consider:
|
|
|
|
^.*? (?(?=a) a | b(*THEN)c )
|
|
|
|
- If the subject is "ba", this pattern does not match. Because .*? is
|
|
- ungreedy, it initially matches zero characters. The condition (?=a)
|
|
- then fails, the character "b" is matched, but "c" is not. At this
|
|
- point, matching does not backtrack to .*? as might perhaps be expected
|
|
- from the presence of the | character. The conditional subpattern is
|
|
+ If the subject is "ba", this pattern does not match. Because .*? is
|
|
+ ungreedy, it initially matches zero characters. The condition (?=a)
|
|
+ then fails, the character "b" is matched, but "c" is not. At this
|
|
+ point, matching does not backtrack to .*? as might perhaps be expected
|
|
+ from the presence of the | character. The conditional subpattern is
|
|
part of the single alternative that comprises the whole pattern, and so
|
|
- the match fails. (If there was a backtrack into .*?, allowing it to
|
|
+ the match fails. (If there was a backtrack into .*?, allowing it to
|
|
match "b", the match would succeed.)
|
|
|
|
- The verbs just described provide four different "strengths" of control
|
|
+ The verbs just described provide four different "strengths" of control
|
|
when subsequent matching fails. (*THEN) is the weakest, carrying on the
|
|
- match at the next alternative. (*PRUNE) comes next, failing the match
|
|
- at the current starting position, but allowing an advance to the next
|
|
- character (for an unanchored pattern). (*SKIP) is similar, except that
|
|
+ match at the next alternative. (*PRUNE) comes next, failing the match
|
|
+ at the current starting position, but allowing an advance to the next
|
|
+ character (for an unanchored pattern). (*SKIP) is similar, except that
|
|
the advance may be more than one character. (*COMMIT) is the strongest,
|
|
causing the entire match to fail.
|
|
|
|
More than one backtracking verb
|
|
|
|
- If more than one backtracking verb is present in a pattern, the one
|
|
- that is backtracked onto first acts. For example, consider this pat-
|
|
+ If more than one backtracking verb is present in a pattern, the one
|
|
+ that is backtracked onto first acts. For example, consider this pat-
|
|
tern, where A, B, etc. are complex pattern fragments:
|
|
|
|
(A(*COMMIT)B(*THEN)C|ABD)
|
|
|
|
- If A matches but B fails, the backtrack to (*COMMIT) causes the entire
|
|
+ If A matches but B fails, the backtrack to (*COMMIT) causes the entire
|
|
match to fail. However, if A and B match, but C fails, the backtrack to
|
|
- (*THEN) causes the next alternative (ABD) to be tried. This behaviour
|
|
- is consistent, but is not always the same as Perl's. It means that if
|
|
- two or more backtracking verbs appear in succession, all the the last
|
|
+ (*THEN) causes the next alternative (ABD) to be tried. This behaviour
|
|
+ is consistent, but is not always the same as Perl's. It means that if
|
|
+ two or more backtracking verbs appear in succession, all but the last
|
|
of them has no effect. Consider this example:
|
|
|
|
...(*COMMIT)(*PRUNE)...
|
|
|
|
If there is a matching failure to the right, backtracking onto (*PRUNE)
|
|
- causes it to be triggered, and its action is taken. There can never be
|
|
+ causes it to be triggered, and its action is taken. There can never be
|
|
a backtrack onto (*COMMIT).
|
|
|
|
Backtracking verbs in repeated groups
|
|
|
|
- PCRE differs from Perl in its handling of backtracking verbs in
|
|
+ PCRE differs from Perl in its handling of backtracking verbs in
|
|
repeated groups. For example, consider:
|
|
|
|
/(a(*COMMIT)b)+ac/
|
|
|
|
- If the subject is "abac", Perl matches, but PCRE fails because the
|
|
+ If the subject is "abac", Perl matches, but PCRE fails because the
|
|
(*COMMIT) in the second repeat of the group acts.
|
|
|
|
Backtracking verbs in assertions
|
|
|
|
- (*FAIL) in an assertion has its normal effect: it forces an immediate
|
|
+ (*FAIL) in an assertion has its normal effect: it forces an immediate
|
|
backtrack.
|
|
|
|
(*ACCEPT) in a positive assertion causes the assertion to succeed with-
|
|
- out any further processing. In a negative assertion, (*ACCEPT) causes
|
|
+ out any further processing. In a negative assertion, (*ACCEPT) causes
|
|
the assertion to fail without any further processing.
|
|
|
|
- The other backtracking verbs are not treated specially if they appear
|
|
- in a positive assertion. In particular, (*THEN) skips to the next
|
|
- alternative in the innermost enclosing group that has alternations,
|
|
+ The other backtracking verbs are not treated specially if they appear
|
|
+ in a positive assertion. In particular, (*THEN) skips to the next
|
|
+ alternative in the innermost enclosing group that has alternations,
|
|
whether or not this is within the assertion.
|
|
|
|
- Negative assertions are, however, different, in order to ensure that
|
|
- changing a positive assertion into a negative assertion changes its
|
|
+ Negative assertions are, however, different, in order to ensure that
|
|
+ changing a positive assertion into a negative assertion changes its
|
|
result. Backtracking into (*COMMIT), (*SKIP), or (*PRUNE) causes a neg-
|
|
ative assertion to be true, without considering any further alternative
|
|
branches in the assertion. Backtracking into (*THEN) causes it to skip
|
|
- to the next enclosing alternative within the assertion (the normal be-
|
|
- haviour), but if the assertion does not have such an alternative,
|
|
+ to the next enclosing alternative within the assertion (the normal be-
|
|
+ haviour), but if the assertion does not have such an alternative,
|
|
(*THEN) behaves like (*PRUNE).
|
|
|
|
Backtracking verbs in subroutines
|
|
|
|
- These behaviours occur whether or not the subpattern is called recur-
|
|
+ These behaviours occur whether or not the subpattern is called recur-
|
|
sively. Perl's treatment of subroutines is different in some cases.
|
|
|
|
- (*FAIL) in a subpattern called as a subroutine has its normal effect:
|
|
+ (*FAIL) in a subpattern called as a subroutine has its normal effect:
|
|
it forces an immediate backtrack.
|
|
|
|
- (*ACCEPT) in a subpattern called as a subroutine causes the subroutine
|
|
- match to succeed without any further processing. Matching then contin-
|
|
+ (*ACCEPT) in a subpattern called as a subroutine causes the subroutine
|
|
+ match to succeed without any further processing. Matching then contin-
|
|
ues after the subroutine call.
|
|
|
|
(*COMMIT), (*SKIP), and (*PRUNE) in a subpattern called as a subroutine
|
|
cause the subroutine match to fail.
|
|
|
|
- (*THEN) skips to the next alternative in the innermost enclosing group
|
|
- within the subpattern that has alternatives. If there is no such group
|
|
+ (*THEN) skips to the next alternative in the innermost enclosing group
|
|
+ within the subpattern that has alternatives. If there is no such group
|
|
within the subpattern, (*THEN) causes the subroutine match to fail.
|
|
|
|
|
|
SEE ALSO
|
|
|
|
- pcreapi(3), pcrecallout(3), pcrematching(3), pcresyntax(3), pcre(3),
|
|
+ pcreapi(3), pcrecallout(3), pcrematching(3), pcresyntax(3), pcre(3),
|
|
pcre16(3), pcre32(3).
|
|
|
|
|
|
@@ -7645,8 +7669,8 @@ AUTHOR
|
|
|
|
REVISION
|
|
|
|
- Last updated: 08 January 2014
|
|
- Copyright (c) 1997-2014 University of Cambridge.
|
|
+ Last updated: 14 June 2015
|
|
+ Copyright (c) 1997-2015 University of Cambridge.
|
|
------------------------------------------------------------------------------
|
|
|
|
|
|
diff --git a/ext/pcre/pcrelib/pcre.h b/ext/pcre/pcrelib/pcre.h
|
|
index 58ed46a..bf6351f 100644
|
|
--- a/ext/pcre/pcrelib/pcre.h
|
|
+++ b/ext/pcre/pcrelib/pcre.h
|
|
@@ -42,9 +42,9 @@ POSSIBILITY OF SUCH DAMAGE.
|
|
/* The current PCRE version information. */
|
|
|
|
#define PCRE_MAJOR 8
|
|
-#define PCRE_MINOR 37
|
|
+#define PCRE_MINOR 38
|
|
#define PCRE_PRERELEASE
|
|
-#define PCRE_DATE 2015-04-28
|
|
+#define PCRE_DATE 2015-11-23
|
|
|
|
/* When an application links to a PCRE DLL in Windows, the symbols that are
|
|
imported have to be identified as such. When building PCRE, the appropriate
|
|
diff --git a/ext/pcre/pcrelib/pcre_compile.c b/ext/pcre/pcrelib/pcre_compile.c
|
|
index 0efad26..4d3b313 100644
|
|
--- a/ext/pcre/pcrelib/pcre_compile.c
|
|
+++ b/ext/pcre/pcrelib/pcre_compile.c
|
|
@@ -174,7 +174,7 @@ static const short int escapes[] = {
|
|
-ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
|
|
CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
|
|
CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
|
|
- CHAR_GRAVE_ACCENT, 7,
|
|
+ CHAR_GRAVE_ACCENT, ESC_a,
|
|
-ESC_b, 0,
|
|
-ESC_d, ESC_e,
|
|
ESC_f, 0,
|
|
@@ -202,9 +202,9 @@ static const short int escapes[] = {
|
|
/* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
|
|
/* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
|
|
/* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
|
|
-/* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
|
|
+/* 80 */ 0, ESC_a, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
|
|
/* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
|
|
-/* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
|
|
+/* 90 */ 0, 0, -ESC_k, 0, 0, ESC_n, 0, -ESC_p,
|
|
/* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
|
|
/* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
|
|
/* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
|
|
@@ -219,6 +219,12 @@ static const short int escapes[] = {
|
|
/* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
|
|
/* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
|
|
};
|
|
+
|
|
+/* We also need a table of characters that may follow \c in an EBCDIC
|
|
+environment for characters 0-31. */
|
|
+
|
|
+static unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
|
|
+
|
|
#endif
|
|
|
|
|
|
@@ -458,7 +464,7 @@ static const char error_texts[] =
|
|
"range out of order in character class\0"
|
|
"nothing to repeat\0"
|
|
/* 10 */
|
|
- "operand of unlimited repeat could match the empty string\0" /** DEAD **/
|
|
+ "internal error: invalid forward reference offset\0"
|
|
"internal error: unexpected repeat\0"
|
|
"unrecognized character after (? or (?-\0"
|
|
"POSIX named classes are supported only within a class\0"
|
|
@@ -527,7 +533,11 @@ static const char error_texts[] =
|
|
"different names for subpatterns of the same number are not allowed\0"
|
|
"(*MARK) must have an argument\0"
|
|
"this version of PCRE is not compiled with Unicode property support\0"
|
|
+#ifndef EBCDIC
|
|
"\\c must be followed by an ASCII character\0"
|
|
+#else
|
|
+ "\\c must be followed by a letter or one of [\\]^_?\0"
|
|
+#endif
|
|
"\\k is not followed by a braced, angle-bracketed, or quoted name\0"
|
|
/* 70 */
|
|
"internal error: unknown opcode in find_fixedlength()\0"
|
|
@@ -1425,7 +1435,16 @@ else
|
|
c ^= 0x40;
|
|
#else /* EBCDIC coding */
|
|
if (c >= CHAR_a && c <= CHAR_z) c += 64;
|
|
- c ^= 0xC0;
|
|
+ if (c == CHAR_QUESTION_MARK)
|
|
+ c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
|
|
+ else
|
|
+ {
|
|
+ for (i = 0; i < 32; i++)
|
|
+ {
|
|
+ if (c == ebcdic_escape_c[i]) break;
|
|
+ }
|
|
+ if (i < 32) c = i; else *errorcodeptr = ERR68;
|
|
+ }
|
|
#endif
|
|
break;
|
|
|
|
@@ -1799,7 +1818,7 @@ for (;;)
|
|
case OP_ASSERTBACK:
|
|
case OP_ASSERTBACK_NOT:
|
|
do cc += GET(cc, 1); while (*cc == OP_ALT);
|
|
- cc += PRIV(OP_lengths)[*cc];
|
|
+ cc += 1 + LINK_SIZE;
|
|
break;
|
|
|
|
/* Skip over things that don't match chars */
|
|
@@ -2487,7 +2506,7 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
|
|
if (c == OP_BRA || c == OP_BRAPOS ||
|
|
c == OP_CBRA || c == OP_CBRAPOS ||
|
|
c == OP_ONCE || c == OP_ONCE_NC ||
|
|
- c == OP_COND)
|
|
+ c == OP_COND || c == OP_SCOND)
|
|
{
|
|
BOOL empty_branch;
|
|
if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
|
|
@@ -3886,11 +3905,11 @@ didn't consider this to be a POSIX class. Likewise for [:1234:].
|
|
The problem in trying to be exactly like Perl is in the handling of escapes. We
|
|
have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
|
|
class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
|
|
-below handles the special case of \], but does not try to do any other escape
|
|
-processing. This makes it different from Perl for cases such as [:l\ower:]
|
|
-where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
|
|
-"l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
|
|
-I think.
|
|
+below handles the special cases \\ and \], but does not try to do any other
|
|
+escape processing. This makes it different from Perl for cases such as
|
|
+[:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
|
|
+not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
|
|
+when Perl does, I think.
|
|
|
|
A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
|
|
It seems that the appearance of a nested POSIX class supersedes an apparent
|
|
@@ -3917,21 +3936,16 @@ pcre_uchar terminator; /* Don't combine these lines; the Solaris cc */
|
|
terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
|
|
for (++ptr; *ptr != CHAR_NULL; ptr++)
|
|
{
|
|
- if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
|
|
+ if (*ptr == CHAR_BACKSLASH &&
|
|
+ (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET ||
|
|
+ ptr[1] == CHAR_BACKSLASH))
|
|
ptr++;
|
|
- else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
|
|
- else
|
|
+ else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
|
|
+ *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
|
|
+ else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
|
|
{
|
|
- if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
|
|
- {
|
|
- *endptr = ptr;
|
|
- return TRUE;
|
|
- }
|
|
- if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&
|
|
- (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
|
|
- ptr[1] == CHAR_EQUALS_SIGN) &&
|
|
- check_posix_syntax(ptr, endptr))
|
|
- return FALSE;
|
|
+ *endptr = ptr;
|
|
+ return TRUE;
|
|
}
|
|
}
|
|
return FALSE;
|
|
@@ -3985,11 +3999,12 @@ have their offsets adjusted. That one of the jobs of this function. Before it
|
|
is called, the partially compiled regex must be temporarily terminated with
|
|
OP_END.
|
|
|
|
-This function has been extended with the possibility of forward references for
|
|
-recursions and subroutine calls. It must also check the list of such references
|
|
-for the group we are dealing with. If it finds that one of the recursions in
|
|
-the current group is on this list, it adjusts the offset in the list, not the
|
|
-value in the reference (which is a group number).
|
|
+This function has been extended to cope with forward references for recursions
|
|
+and subroutine calls. It must check the list of such references for the
|
|
+group we are dealing with. If it finds that one of the recursions in the
|
|
+current group is on this list, it does not adjust the value in the reference
|
|
+(which is a group number). After the group has been scanned, all the offsets in
|
|
+the forward reference list for the group are adjusted.
|
|
|
|
Arguments:
|
|
group points to the start of the group
|
|
@@ -4005,29 +4020,21 @@ static void
|
|
adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
|
|
size_t save_hwm_offset)
|
|
{
|
|
+int offset;
|
|
+pcre_uchar *hc;
|
|
pcre_uchar *ptr = group;
|
|
|
|
while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
|
|
{
|
|
- int offset;
|
|
- pcre_uchar *hc;
|
|
-
|
|
- /* See if this recursion is on the forward reference list. If so, adjust the
|
|
- reference. */
|
|
-
|
|
for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset; hc < cd->hwm;
|
|
hc += LINK_SIZE)
|
|
{
|
|
offset = (int)GET(hc, 0);
|
|
- if (cd->start_code + offset == ptr + 1)
|
|
- {
|
|
- PUT(hc, 0, offset + adjust);
|
|
- break;
|
|
- }
|
|
+ if (cd->start_code + offset == ptr + 1) break;
|
|
}
|
|
|
|
- /* Otherwise, adjust the recursion offset if it's after the start of this
|
|
- group. */
|
|
+ /* If we have not found this recursion on the forward reference list, adjust
|
|
+ the recursion's offset if it's after the start of this group. */
|
|
|
|
if (hc >= cd->hwm)
|
|
{
|
|
@@ -4037,6 +4044,15 @@ while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
|
|
|
|
ptr += 1 + LINK_SIZE;
|
|
}
|
|
+
|
|
+/* Now adjust all forward reference offsets for the group. */
|
|
+
|
|
+for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset; hc < cd->hwm;
|
|
+ hc += LINK_SIZE)
|
|
+ {
|
|
+ offset = (int)GET(hc, 0);
|
|
+ PUT(hc, 0, offset + adjust);
|
|
+ }
|
|
}
|
|
|
|
|
|
@@ -4465,7 +4481,7 @@ const pcre_uchar *tempptr;
|
|
const pcre_uchar *nestptr = NULL;
|
|
pcre_uchar *previous = NULL;
|
|
pcre_uchar *previous_callout = NULL;
|
|
-size_t save_hwm_offset = 0;
|
|
+size_t item_hwm_offset = 0;
|
|
pcre_uint8 classbits[32];
|
|
|
|
/* We can fish out the UTF-8 setting once and for all into a BOOL, but we
|
|
@@ -4623,8 +4639,7 @@ for (;; ptr++)
|
|
/* In the real compile phase, just check the workspace used by the forward
|
|
reference list. */
|
|
|
|
- else if (cd->hwm > cd->start_workspace + cd->workspace_size -
|
|
- WORK_SIZE_SAFETY_MARGIN)
|
|
+ else if (cd->hwm > cd->start_workspace + cd->workspace_size)
|
|
{
|
|
*errorcodeptr = ERR52;
|
|
goto FAILED;
|
|
@@ -4767,6 +4782,7 @@ for (;; ptr++)
|
|
zeroreqchar = reqchar;
|
|
zeroreqcharflags = reqcharflags;
|
|
previous = code;
|
|
+ item_hwm_offset = cd->hwm - cd->start_workspace;
|
|
*code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
|
|
break;
|
|
|
|
@@ -4818,6 +4834,7 @@ for (;; ptr++)
|
|
/* Handle a real character class. */
|
|
|
|
previous = code;
|
|
+ item_hwm_offset = cd->hwm - cd->start_workspace;
|
|
|
|
/* PCRE supports POSIX class stuff inside a class. Perl gives an error if
|
|
they are encountered at the top level, so we'll do that too. */
|
|
@@ -4923,9 +4940,10 @@ for (;; ptr++)
|
|
(which is on the stack). We have to remember that there was XCLASS data,
|
|
however. */
|
|
|
|
+ if (class_uchardata > class_uchardata_base) xclass = TRUE;
|
|
+
|
|
if (lengthptr != NULL && class_uchardata > class_uchardata_base)
|
|
{
|
|
- xclass = TRUE;
|
|
*lengthptr += (int)(class_uchardata - class_uchardata_base);
|
|
class_uchardata = class_uchardata_base;
|
|
}
|
|
@@ -5028,10 +5046,26 @@ for (;; ptr++)
|
|
ptr = tempptr + 1;
|
|
continue;
|
|
|
|
- /* For all other POSIX classes, no special action is taken in UCP
|
|
- mode. Fall through to the non_UCP case. */
|
|
+ /* For the other POSIX classes (ascii, xdigit) we are going to fall
|
|
+ through to the non-UCP case and build a bit map for characters with
|
|
+ code points less than 256. If we are in a negated POSIX class
|
|
+ within a non-negated overall class, characters with code points
|
|
+ greater than 255 must all match. In the special case where we have
|
|
+ not yet generated any xclass data, and this is the final item in
|
|
+ the overall class, we need do nothing: later on, the opcode
|
|
+ OP_NCLASS will be used to indicate that characters greater than 255
|
|
+ are acceptable. If we have already seen an xclass item or one may
|
|
+ follow (we have to assume that it might if this is not the end of
|
|
+ the class), explicitly match all wide codepoints. */
|
|
|
|
default:
|
|
+ if (!negate_class && local_negate &&
|
|
+ (xclass || tempptr[2] != CHAR_RIGHT_SQUARE_BRACKET))
|
|
+ {
|
|
+ *class_uchardata++ = XCL_RANGE;
|
|
+ class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
|
|
+ class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
|
|
+ }
|
|
break;
|
|
}
|
|
}
|
|
@@ -5195,9 +5229,9 @@ for (;; ptr++)
|
|
cd, PRIV(vspace_list));
|
|
continue;
|
|
|
|
-#ifdef SUPPORT_UCP
|
|
case ESC_p:
|
|
case ESC_P:
|
|
+#ifdef SUPPORT_UCP
|
|
{
|
|
BOOL negated;
|
|
unsigned int ptype = 0, pdata = 0;
|
|
@@ -5211,6 +5245,9 @@ for (;; ptr++)
|
|
class_has_8bitchar--; /* Undo! */
|
|
continue;
|
|
}
|
|
+#else
|
|
+ *errorcodeptr = ERR45;
|
|
+ goto FAILED;
|
|
#endif
|
|
/* Unrecognized escapes are faulted if PCRE is running in its
|
|
strict mode. By default, for compatibility with Perl, they are
|
|
@@ -5367,16 +5404,20 @@ for (;; ptr++)
|
|
CLASS_SINGLE_CHARACTER:
|
|
if (class_one_char < 2) class_one_char++;
|
|
|
|
- /* If class_one_char is 1, we have the first single character in the
|
|
- class, and there have been no prior ranges, or XCLASS items generated by
|
|
- escapes. If this is the final character in the class, we can optimize by
|
|
- turning the item into a 1-character OP_CHAR[I] if it's positive, or
|
|
- OP_NOT[I] if it's negative. In the positive case, it can cause firstchar
|
|
- to be set. Otherwise, there can be no first char if this item is first,
|
|
- whatever repeat count may follow. In the case of reqchar, save the
|
|
- previous value for reinstating. */
|
|
+ /* If xclass_has_prop is false and class_one_char is 1, we have the first
|
|
+ single character in the class, and there have been no prior ranges, or
|
|
+ XCLASS items generated by escapes. If this is the final character in the
|
|
+ class, we can optimize by turning the item into a 1-character OP_CHAR[I]
|
|
+ if it's positive, or OP_NOT[I] if it's negative. In the positive case, it
|
|
+ can cause firstchar to be set. Otherwise, there can be no first char if
|
|
+ this item is first, whatever repeat count may follow. In the case of
|
|
+ reqchar, save the previous value for reinstating. */
|
|
|
|
- if (!inescq && class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
|
|
+ if (!inescq &&
|
|
+#ifdef SUPPORT_UCP
|
|
+ !xclass_has_prop &&
|
|
+#endif
|
|
+ class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
|
|
{
|
|
ptr++;
|
|
zeroreqchar = reqchar;
|
|
@@ -5492,9 +5533,10 @@ for (;; ptr++)
|
|
actual compiled code. */
|
|
|
|
#ifdef SUPPORT_UTF
|
|
- if (xclass && (!should_flip_negation || (options & PCRE_UCP) != 0))
|
|
+ if (xclass && (xclass_has_prop || !should_flip_negation ||
|
|
+ (options & PCRE_UCP) != 0))
|
|
#elif !defined COMPILE_PCRE8
|
|
- if (xclass && !should_flip_negation)
|
|
+ if (xclass && (xclass_has_prop || !should_flip_negation))
|
|
#endif
|
|
#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
|
|
{
|
|
@@ -5930,7 +5972,7 @@ for (;; ptr++)
|
|
{
|
|
register int i;
|
|
int len = (int)(code - previous);
|
|
- size_t base_hwm_offset = save_hwm_offset;
|
|
+ size_t base_hwm_offset = item_hwm_offset;
|
|
pcre_uchar *bralink = NULL;
|
|
pcre_uchar *brazeroptr = NULL;
|
|
|
|
@@ -5985,7 +6027,7 @@ for (;; ptr++)
|
|
if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
|
|
{
|
|
*code = OP_END;
|
|
- adjust_recurse(previous, 1, utf, cd, save_hwm_offset);
|
|
+ adjust_recurse(previous, 1, utf, cd, item_hwm_offset);
|
|
memmove(previous + 1, previous, IN_UCHARS(len));
|
|
code++;
|
|
if (repeat_max == 0)
|
|
@@ -6009,7 +6051,7 @@ for (;; ptr++)
|
|
{
|
|
int offset;
|
|
*code = OP_END;
|
|
- adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, save_hwm_offset);
|
|
+ adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, item_hwm_offset);
|
|
memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len));
|
|
code += 2 + LINK_SIZE;
|
|
*previous++ = OP_BRAZERO + repeat_type;
|
|
@@ -6254,6 +6296,12 @@ for (;; ptr++)
|
|
while (*scode == OP_ALT);
|
|
}
|
|
|
|
+ /* A conditional group with only one branch has an implicit empty
|
|
+ alternative branch. */
|
|
+
|
|
+ if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
|
|
+ *bracode = OP_SCOND;
|
|
+
|
|
/* Handle possessive quantifiers. */
|
|
|
|
if (possessive_quantifier)
|
|
@@ -6267,11 +6315,11 @@ for (;; ptr++)
|
|
{
|
|
int nlen = (int)(code - bracode);
|
|
*code = OP_END;
|
|
- adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, save_hwm_offset);
|
|
+ adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
|
|
memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen));
|
|
code += 1 + LINK_SIZE;
|
|
nlen += 1 + LINK_SIZE;
|
|
- *bracode = OP_BRAPOS;
|
|
+ *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
|
|
*code++ = OP_KETRPOS;
|
|
PUTINC(code, 0, nlen);
|
|
PUT(bracode, 1, nlen);
|
|
@@ -6401,7 +6449,7 @@ for (;; ptr++)
|
|
else
|
|
{
|
|
*code = OP_END;
|
|
- adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm_offset);
|
|
+ adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
|
|
memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
|
|
code += 1 + LINK_SIZE;
|
|
len += 1 + LINK_SIZE;
|
|
@@ -6450,7 +6498,7 @@ for (;; ptr++)
|
|
|
|
default:
|
|
*code = OP_END;
|
|
- adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm_offset);
|
|
+ adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
|
|
memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
|
|
code += 1 + LINK_SIZE;
|
|
len += 1 + LINK_SIZE;
|
|
@@ -6586,9 +6634,17 @@ for (;; ptr++)
|
|
goto FAILED;
|
|
}
|
|
setverb = *code++ = verbs[i].op_arg;
|
|
- *code++ = arglen;
|
|
- memcpy(code, arg, IN_UCHARS(arglen));
|
|
- code += arglen;
|
|
+ if (lengthptr != NULL) /* In pass 1 just add in the length */
|
|
+ { /* to avoid potential workspace */
|
|
+ *lengthptr += arglen; /* overflow. */
|
|
+ *code++ = 0;
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ *code++ = arglen;
|
|
+ memcpy(code, arg, IN_UCHARS(arglen));
|
|
+ code += arglen;
|
|
+ }
|
|
*code++ = 0;
|
|
}
|
|
|
|
@@ -6623,7 +6679,7 @@ for (;; ptr++)
|
|
newoptions = options;
|
|
skipbytes = 0;
|
|
bravalue = OP_CBRA;
|
|
- save_hwm_offset = cd->hwm - cd->start_workspace;
|
|
+ item_hwm_offset = cd->hwm - cd->start_workspace;
|
|
reset_bracount = FALSE;
|
|
|
|
/* Deal with the extended parentheses; all are introduced by '?', and the
|
|
@@ -6641,6 +6697,7 @@ for (;; ptr++)
|
|
/* ------------------------------------------------------------ */
|
|
case CHAR_VERTICAL_LINE: /* Reset capture count for each branch */
|
|
reset_bracount = TRUE;
|
|
+ cd->dupgroups = TRUE; /* Record (?| encountered */
|
|
/* Fall through */
|
|
|
|
/* ------------------------------------------------------------ */
|
|
@@ -6741,6 +6798,12 @@ for (;; ptr++)
|
|
{
|
|
while (IS_DIGIT(*ptr))
|
|
{
|
|
+ if (recno > INT_MAX / 10 - 1) /* Integer overflow */
|
|
+ {
|
|
+ while (IS_DIGIT(*ptr)) ptr++;
|
|
+ *errorcodeptr = ERR61;
|
|
+ goto FAILED;
|
|
+ }
|
|
recno = recno * 10 + (int)(*ptr - CHAR_0);
|
|
ptr++;
|
|
}
|
|
@@ -6769,7 +6832,7 @@ for (;; ptr++)
|
|
ptr++;
|
|
}
|
|
namelen = (int)(ptr - name);
|
|
- if (lengthptr != NULL) *lengthptr += IMM2_SIZE;
|
|
+ if (lengthptr != NULL) skipbytes += IMM2_SIZE;
|
|
}
|
|
|
|
/* Check the terminator */
|
|
@@ -6875,6 +6938,11 @@ for (;; ptr++)
|
|
*errorcodeptr = ERR15;
|
|
goto FAILED;
|
|
}
|
|
+ if (recno > INT_MAX / 10 - 1) /* Integer overflow */
|
|
+ {
|
|
+ *errorcodeptr = ERR61;
|
|
+ goto FAILED;
|
|
+ }
|
|
recno = recno * 10 + name[i] - CHAR_0;
|
|
}
|
|
if (recno == 0) recno = RREF_ANY;
|
|
@@ -7151,6 +7219,7 @@ for (;; ptr++)
|
|
if (lengthptr != NULL)
|
|
{
|
|
named_group *ng;
|
|
+ recno = 0;
|
|
|
|
if (namelen == 0)
|
|
{
|
|
@@ -7168,20 +7237,6 @@ for (;; ptr++)
|
|
goto FAILED;
|
|
}
|
|
|
|
- /* The name table does not exist in the first pass; instead we must
|
|
- scan the list of names encountered so far in order to get the
|
|
- number. If the name is not found, set the value to 0 for a forward
|
|
- reference. */
|
|
-
|
|
- ng = cd->named_groups;
|
|
- for (i = 0; i < cd->names_found; i++, ng++)
|
|
- {
|
|
- if (namelen == ng->length &&
|
|
- STRNCMP_UC_UC(name, ng->name, namelen) == 0)
|
|
- break;
|
|
- }
|
|
- recno = (i < cd->names_found)? ng->number : 0;
|
|
-
|
|
/* Count named back references. */
|
|
|
|
if (!is_recurse) cd->namedrefcount++;
|
|
@@ -7191,6 +7246,56 @@ for (;; ptr++)
|
|
16-bit data item. */
|
|
|
|
*lengthptr += IMM2_SIZE;
|
|
+
|
|
+ /* If this is a forward reference and we are within a (?|...) group,
|
|
+ the reference may end up as the number of a group which we are
|
|
+ currently inside, that is, it could be a recursive reference. In the
|
|
+ real compile this will be picked up and the reference wrapped with
|
|
+ OP_ONCE to make it atomic, so we must reserve space in case this occurs. */
|
|
+
|
|
+ /* In fact, this can happen for a non-forward reference because
|
|
+ another group with the same number might be created later. This
|
|
+ issue is fixed "properly" in PCRE2. As PCRE1 is now in maintenance
|
|
+ only mode, we finesse the bug by allowing more memory always. */
|
|
+
|
|
+ *lengthptr += 2 + 2*LINK_SIZE;
|
|
+
|
|
+ /* It is even worse than that. The current reference may be to an
|
|
+ existing named group with a different number (so apparently not
|
|
+ recursive) but which later on is also attached to a group with the
|
|
+ current number. This can only happen if (?| has been previously
|
|
+ encountered. In that case, we allow yet more memory, just in case.
|
|
+ (Again, this is fixed "properly" in PCRE2.) */
|
|
+
|
|
+ if (cd->dupgroups) *lengthptr += 4 + 4*LINK_SIZE;
|
|
+
|
|
+ /* Otherwise, check for recursion here. The name table does not exist
|
|
+ in the first pass; instead we must scan the list of names encountered
|
|
+ so far in order to get the number. If the name is not found, leave
|
|
+ the value of recno as 0 for a forward reference. */
|
|
+
|
|
+ else
|
|
+ {
|
|
+ ng = cd->named_groups;
|
|
+ for (i = 0; i < cd->names_found; i++, ng++)
|
|
+ {
|
|
+ if (namelen == ng->length &&
|
|
+ STRNCMP_UC_UC(name, ng->name, namelen) == 0)
|
|
+ {
|
|
+ open_capitem *oc;
|
|
+ recno = ng->number;
|
|
+ if (is_recurse) break;
|
|
+ for (oc = cd->open_caps; oc != NULL; oc = oc->next)
|
|
+ {
|
|
+ if (oc->number == recno)
|
|
+ {
|
|
+ oc->flag = TRUE;
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ }
|
|
}
|
|
|
|
/* In the real compile, search the name table. We check the name
|
|
@@ -7237,8 +7342,6 @@ for (;; ptr++)
|
|
for (i++; i < cd->names_found; i++)
|
|
{
|
|
if (STRCMP_UC_UC(slot + IMM2_SIZE, cslot + IMM2_SIZE) != 0) break;
|
|
-
|
|
-
|
|
count++;
|
|
cslot += cd->name_entry_size;
|
|
}
|
|
@@ -7247,6 +7350,7 @@ for (;; ptr++)
|
|
{
|
|
if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
|
|
previous = code;
|
|
+ item_hwm_offset = cd->hwm - cd->start_workspace;
|
|
*code++ = ((options & PCRE_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
|
|
PUT2INC(code, 0, index);
|
|
PUT2INC(code, 0, count);
|
|
@@ -7284,9 +7388,14 @@ for (;; ptr++)
|
|
|
|
|
|
/* ------------------------------------------------------------ */
|
|
- case CHAR_R: /* Recursion */
|
|
- ptr++; /* Same as (?0) */
|
|
- /* Fall through */
|
|
+ case CHAR_R: /* Recursion, same as (?0) */
|
|
+ recno = 0;
|
|
+ if (*(++ptr) != CHAR_RIGHT_PARENTHESIS)
|
|
+ {
|
|
+ *errorcodeptr = ERR29;
|
|
+ goto FAILED;
|
|
+ }
|
|
+ goto HANDLE_RECURSION;
|
|
|
|
|
|
/* ------------------------------------------------------------ */
|
|
@@ -7323,7 +7432,15 @@ for (;; ptr++)
|
|
|
|
recno = 0;
|
|
while(IS_DIGIT(*ptr))
|
|
+ {
|
|
+ if (recno > INT_MAX / 10 - 1) /* Integer overflow */
|
|
+ {
|
|
+ while (IS_DIGIT(*ptr)) ptr++;
|
|
+ *errorcodeptr = ERR61;
|
|
+ goto FAILED;
|
|
+ }
|
|
recno = recno * 10 + *ptr++ - CHAR_0;
|
|
+ }
|
|
|
|
if (*ptr != (pcre_uchar)terminator)
|
|
{
|
|
@@ -7360,6 +7477,7 @@ for (;; ptr++)
|
|
HANDLE_RECURSION:
|
|
|
|
previous = code;
|
|
+ item_hwm_offset = cd->hwm - cd->start_workspace;
|
|
called = cd->start_code;
|
|
|
|
/* When we are actually compiling, find the bracket that is being
|
|
@@ -7561,7 +7679,11 @@ for (;; ptr++)
|
|
previous = NULL;
|
|
cd->iscondassert = FALSE;
|
|
}
|
|
- else previous = code;
|
|
+ else
|
|
+ {
|
|
+ previous = code;
|
|
+ item_hwm_offset = cd->hwm - cd->start_workspace;
|
|
+ }
|
|
|
|
*code = bravalue;
|
|
tempcode = code;
|
|
@@ -7809,7 +7931,7 @@ for (;; ptr++)
|
|
const pcre_uchar *p;
|
|
pcre_uint32 cf;
|
|
|
|
- save_hwm_offset = cd->hwm - cd->start_workspace; /* Normally this is set when '(' is read */
|
|
+ item_hwm_offset = cd->hwm - cd->start_workspace; /* Normally this is set when '(' is read */
|
|
terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
|
|
CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
|
|
|
|
@@ -7838,7 +7960,7 @@ for (;; ptr++)
|
|
if (*p != (pcre_uchar)terminator)
|
|
{
|
|
*errorcodeptr = ERR57;
|
|
- break;
|
|
+ goto FAILED;
|
|
}
|
|
ptr++;
|
|
goto HANDLE_NUMERICAL_RECURSION;
|
|
@@ -7853,7 +7975,7 @@ for (;; ptr++)
|
|
ptr[1] != CHAR_APOSTROPHE && ptr[1] != CHAR_LEFT_CURLY_BRACKET))
|
|
{
|
|
*errorcodeptr = ERR69;
|
|
- break;
|
|
+ goto FAILED;
|
|
}
|
|
is_recurse = FALSE;
|
|
terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
|
|
@@ -7877,6 +7999,7 @@ for (;; ptr++)
|
|
HANDLE_REFERENCE:
|
|
if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
|
|
previous = code;
|
|
+ item_hwm_offset = cd->hwm - cd->start_workspace;
|
|
*code++ = ((options & PCRE_CASELESS) != 0)? OP_REFI : OP_REF;
|
|
PUT2INC(code, 0, recno);
|
|
cd->backref_map |= (recno < 32)? (1 << recno) : 1;
|
|
@@ -7906,6 +8029,7 @@ for (;; ptr++)
|
|
if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
|
|
goto FAILED;
|
|
previous = code;
|
|
+ item_hwm_offset = cd->hwm - cd->start_workspace;
|
|
*code++ = ((escape == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
|
|
*code++ = ptype;
|
|
*code++ = pdata;
|
|
@@ -7946,6 +8070,7 @@ for (;; ptr++)
|
|
|
|
{
|
|
previous = (escape > ESC_b && escape < ESC_Z)? code : NULL;
|
|
+ item_hwm_offset = cd->hwm - cd->start_workspace;
|
|
*code++ = (!utf && escape == ESC_C)? OP_ALLANY : escape;
|
|
}
|
|
}
|
|
@@ -7989,6 +8114,7 @@ for (;; ptr++)
|
|
|
|
ONE_CHAR:
|
|
previous = code;
|
|
+ item_hwm_offset = cd->hwm - cd->start_workspace;
|
|
|
|
/* For caseless UTF-8 mode when UCP support is available, check whether
|
|
this character has more than one other case. If so, generate a special
|
|
@@ -9164,6 +9290,7 @@ cd->names_found = 0;
|
|
cd->name_entry_size = 0;
|
|
cd->name_table = NULL;
|
|
cd->dupnames = FALSE;
|
|
+cd->dupgroups = FALSE;
|
|
cd->namedrefcount = 0;
|
|
cd->start_code = cworkspace;
|
|
cd->hwm = cworkspace;
|
|
@@ -9336,6 +9463,16 @@ if (cd->hwm > cd->start_workspace)
|
|
int offset, recno;
|
|
cd->hwm -= LINK_SIZE;
|
|
offset = GET(cd->hwm, 0);
|
|
+
|
|
+ /* Check that the hwm handling hasn't gone wrong. This whole area is
|
|
+ rewritten in PCRE2 because there are some obscure cases. */
|
|
+
|
|
+ if (offset == 0 || codestart[offset-1] != OP_RECURSE)
|
|
+ {
|
|
+ errorcode = ERR10;
|
|
+ break;
|
|
+ }
|
|
+
|
|
recno = GET(codestart, offset);
|
|
if (recno != prev_recno)
|
|
{
|
|
@@ -9366,7 +9503,7 @@ used in this code because at least one compiler gives a warning about loss of
|
|
"const" attribute if the cast (pcre_uchar *)codestart is used directly in the
|
|
function call. */
|
|
|
|
-if ((options & PCRE_NO_AUTO_POSSESS) == 0)
|
|
+if (errorcode == 0 && (options & PCRE_NO_AUTO_POSSESS) == 0)
|
|
{
|
|
pcre_uchar *temp = (pcre_uchar *)codestart;
|
|
auto_possessify(temp, utf, cd);
|
|
@@ -9380,7 +9517,7 @@ OP_RECURSE that are not fixed length get a diagnosic with a useful offset. The
|
|
exceptional ones forgo this. We scan the pattern to check that they are fixed
|
|
length, and set their lengths. */
|
|
|
|
-if (cd->check_lookbehind)
|
|
+if (errorcode == 0 && cd->check_lookbehind)
|
|
{
|
|
pcre_uchar *cc = (pcre_uchar *)codestart;
|
|
|
|
@@ -9593,4 +9730,3 @@ return (pcre32 *)re;
|
|
}
|
|
|
|
/* End of pcre_compile.c */
|
|
-
|
|
diff --git a/ext/pcre/pcrelib/pcre_exec.c b/ext/pcre/pcrelib/pcre_exec.c
|
|
index 3942076..24b23ca 100644
|
|
--- a/ext/pcre/pcrelib/pcre_exec.c
|
|
+++ b/ext/pcre/pcrelib/pcre_exec.c
|
|
@@ -688,7 +688,7 @@ the alternative names that are used. */
|
|
#define foc number
|
|
#define save_mark data
|
|
|
|
-/* These statements are here to stop the compiler complaining about uninitialized
|
|
+/* These statements are here to stop the compiler complaining about uninitialized
|
|
variables. */
|
|
|
|
#ifdef SUPPORT_UCP
|
|
@@ -6685,7 +6685,8 @@ if (md->offset_vector != NULL)
|
|
register int *iend = iptr - re->top_bracket;
|
|
if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
|
|
while (--iptr >= iend) *iptr = -1;
|
|
- md->offset_vector[0] = md->offset_vector[1] = -1;
|
|
+ if (offsetcount > 0) md->offset_vector[0] = -1;
|
|
+ if (offsetcount > 1) md->offset_vector[1] = -1;
|
|
}
|
|
|
|
/* Set up the first character to match, if available. The first_char value is
|
|
diff --git a/ext/pcre/pcrelib/pcre_internal.h b/ext/pcre/pcrelib/pcre_internal.h
|
|
index 4c4817d..aec1879 100644
|
|
--- a/ext/pcre/pcrelib/pcre_internal.h
|
|
+++ b/ext/pcre/pcrelib/pcre_internal.h
|
|
@@ -988,7 +988,7 @@ other. NOTE: The values also appear in pcre_jit_compile.c. */
|
|
#ifndef EBCDIC
|
|
|
|
#define HSPACE_LIST \
|
|
- CHAR_HT, CHAR_SPACE, 0xa0, \
|
|
+ CHAR_HT, CHAR_SPACE, CHAR_NBSP, \
|
|
0x1680, 0x180e, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, \
|
|
0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x202f, 0x205f, 0x3000, \
|
|
NOTACHAR
|
|
@@ -1014,7 +1014,7 @@ other. NOTE: The values also appear in pcre_jit_compile.c. */
|
|
#define HSPACE_BYTE_CASES \
|
|
case CHAR_HT: \
|
|
case CHAR_SPACE: \
|
|
- case 0xa0 /* NBSP */
|
|
+ case CHAR_NBSP
|
|
|
|
#define HSPACE_CASES \
|
|
HSPACE_BYTE_CASES: \
|
|
@@ -1041,11 +1041,12 @@ other. NOTE: The values also appear in pcre_jit_compile.c. */
|
|
/* ------ EBCDIC environments ------ */
|
|
|
|
#else
|
|
-#define HSPACE_LIST CHAR_HT, CHAR_SPACE
|
|
+#define HSPACE_LIST CHAR_HT, CHAR_SPACE, CHAR_NBSP, NOTACHAR
|
|
|
|
#define HSPACE_BYTE_CASES \
|
|
case CHAR_HT: \
|
|
- case CHAR_SPACE
|
|
+ case CHAR_SPACE: \
|
|
+ case CHAR_NBSP
|
|
|
|
#define HSPACE_CASES HSPACE_BYTE_CASES
|
|
|
|
@@ -1219,6 +1220,7 @@ same code point. */
|
|
|
|
#define CHAR_ESC '\047'
|
|
#define CHAR_DEL '\007'
|
|
+#define CHAR_NBSP '\x41'
|
|
#define STR_ESC "\047"
|
|
#define STR_DEL "\007"
|
|
|
|
@@ -1233,6 +1235,7 @@ a positive value. */
|
|
#define CHAR_NEL ((unsigned char)'\x85')
|
|
#define CHAR_ESC '\033'
|
|
#define CHAR_DEL '\177'
|
|
+#define CHAR_NBSP ((unsigned char)'\xa0')
|
|
|
|
#define STR_LF "\n"
|
|
#define STR_NL STR_LF
|
|
@@ -1610,6 +1613,7 @@ only. */
|
|
#define CHAR_VERTICAL_LINE '\174'
|
|
#define CHAR_RIGHT_CURLY_BRACKET '\175'
|
|
#define CHAR_TILDE '\176'
|
|
+#define CHAR_NBSP ((unsigned char)'\xa0')
|
|
|
|
#define STR_HT "\011"
|
|
#define STR_VT "\013"
|
|
@@ -1766,6 +1770,10 @@ only. */
|
|
|
|
/* Escape items that are just an encoding of a particular data value. */
|
|
|
|
+#ifndef ESC_a
|
|
+#define ESC_a CHAR_BEL
|
|
+#endif
|
|
+
|
|
#ifndef ESC_e
|
|
#define ESC_e CHAR_ESC
|
|
#endif
|
|
@@ -2450,6 +2458,7 @@ typedef struct compile_data {
|
|
BOOL had_pruneorskip; /* (*PRUNE) or (*SKIP) encountered */
|
|
BOOL check_lookbehind; /* Lookbehinds need later checking */
|
|
BOOL dupnames; /* Duplicate names exist */
|
|
+ BOOL dupgroups; /* Duplicate groups exist: (?| found */
|
|
BOOL iscondassert; /* Next assert is a condition */
|
|
int nltype; /* Newline type */
|
|
int nllen; /* Newline string length */
|
|
diff --git a/ext/pcre/pcrelib/pcre_jit_compile.c b/ext/pcre/pcrelib/pcre_jit_compile.c
|
|
index debdf6e..445de0c 100644
|
|
--- a/ext/pcre/pcrelib/pcre_jit_compile.c
|
|
+++ b/ext/pcre/pcrelib/pcre_jit_compile.c
|
|
@@ -1064,6 +1064,7 @@ pcre_uchar *alternative;
|
|
pcre_uchar *end = NULL;
|
|
int private_data_ptr = *private_data_start;
|
|
int space, size, bracketlen;
|
|
+BOOL repeat_check = TRUE;
|
|
|
|
while (cc < ccend)
|
|
{
|
|
@@ -1071,9 +1072,10 @@ while (cc < ccend)
|
|
size = 0;
|
|
bracketlen = 0;
|
|
if (private_data_ptr > SLJIT_MAX_LOCAL_SIZE)
|
|
- return;
|
|
+ break;
|
|
|
|
- if (*cc == OP_ONCE || *cc == OP_ONCE_NC || *cc == OP_BRA || *cc == OP_CBRA || *cc == OP_COND)
|
|
+ if (repeat_check && (*cc == OP_ONCE || *cc == OP_ONCE_NC || *cc == OP_BRA || *cc == OP_CBRA || *cc == OP_COND))
|
|
+ {
|
|
if (detect_repeat(common, cc))
|
|
{
|
|
/* These brackets are converted to repeats, so no global
|
|
@@ -1081,6 +1083,8 @@ while (cc < ccend)
|
|
if (cc >= end)
|
|
end = bracketend(cc);
|
|
}
|
|
+ }
|
|
+ repeat_check = TRUE;
|
|
|
|
switch(*cc)
|
|
{
|
|
@@ -1136,6 +1140,13 @@ while (cc < ccend)
|
|
bracketlen = 1 + LINK_SIZE + IMM2_SIZE;
|
|
break;
|
|
|
|
+ case OP_BRAZERO:
|
|
+ case OP_BRAMINZERO:
|
|
+ case OP_BRAPOSZERO:
|
|
+ repeat_check = FALSE;
|
|
+ size = 1;
|
|
+ break;
|
|
+
|
|
CASE_ITERATOR_PRIVATE_DATA_1
|
|
space = 1;
|
|
size = -2;
|
|
@@ -1162,12 +1173,17 @@ while (cc < ccend)
|
|
size = 1;
|
|
break;
|
|
|
|
- CASE_ITERATOR_TYPE_PRIVATE_DATA_2B
|
|
+ case OP_TYPEUPTO:
|
|
if (cc[1 + IMM2_SIZE] != OP_ANYNL && cc[1 + IMM2_SIZE] != OP_EXTUNI)
|
|
space = 2;
|
|
size = 1 + IMM2_SIZE;
|
|
break;
|
|
|
|
+ case OP_TYPEMINUPTO:
|
|
+ space = 2;
|
|
+ size = 1 + IMM2_SIZE;
|
|
+ break;
|
|
+
|
|
case OP_CLASS:
|
|
case OP_NCLASS:
|
|
size += 1 + 32 / sizeof(pcre_uchar);
|
|
@@ -1316,6 +1332,13 @@ while (cc < ccend)
|
|
cc += 1 + LINK_SIZE + IMM2_SIZE;
|
|
break;
|
|
|
|
+ case OP_THEN:
|
|
+ stack_restore = TRUE;
|
|
+ if (common->control_head_ptr != 0)
|
|
+ *needs_control_head = TRUE;
|
|
+ cc ++;
|
|
+ break;
|
|
+
|
|
default:
|
|
stack_restore = TRUE;
|
|
/* Fall through. */
|
|
@@ -2220,6 +2243,7 @@ while (current != NULL)
|
|
SLJIT_ASSERT_STOP();
|
|
break;
|
|
}
|
|
+ SLJIT_ASSERT(current > (sljit_sw*)current[-1]);
|
|
current = (sljit_sw*)current[-1];
|
|
}
|
|
return -1;
|
|
@@ -3209,7 +3233,7 @@ bytes[len] = byte;
|
|
bytes[0] = len;
|
|
}
|
|
|
|
-static int scan_prefix(compiler_common *common, pcre_uchar *cc, pcre_uint32 *chars, pcre_uint8 *bytes, int max_chars)
|
|
+static int scan_prefix(compiler_common *common, pcre_uchar *cc, pcre_uint32 *chars, pcre_uint8 *bytes, int max_chars, pcre_uint32 *rec_count)
|
|
{
|
|
/* Recursive function, which scans prefix literals. */
|
|
BOOL last, any, caseless;
|
|
@@ -3227,9 +3251,14 @@ pcre_uchar othercase[1];
|
|
repeat = 1;
|
|
while (TRUE)
|
|
{
|
|
+ if (*rec_count == 0)
|
|
+ return 0;
|
|
+ (*rec_count)--;
|
|
+
|
|
last = TRUE;
|
|
any = FALSE;
|
|
caseless = FALSE;
|
|
+
|
|
switch (*cc)
|
|
{
|
|
case OP_CHARI:
|
|
@@ -3291,7 +3320,7 @@ while (TRUE)
|
|
#ifdef SUPPORT_UTF
|
|
if (common->utf && HAS_EXTRALEN(*cc)) len += GET_EXTRALEN(*cc);
|
|
#endif
|
|
- max_chars = scan_prefix(common, cc + len, chars, bytes, max_chars);
|
|
+ max_chars = scan_prefix(common, cc + len, chars, bytes, max_chars, rec_count);
|
|
if (max_chars == 0)
|
|
return consumed;
|
|
last = FALSE;
|
|
@@ -3314,7 +3343,7 @@ while (TRUE)
|
|
alternative = cc + GET(cc, 1);
|
|
while (*alternative == OP_ALT)
|
|
{
|
|
- max_chars = scan_prefix(common, alternative + 1 + LINK_SIZE, chars, bytes, max_chars);
|
|
+ max_chars = scan_prefix(common, alternative + 1 + LINK_SIZE, chars, bytes, max_chars, rec_count);
|
|
if (max_chars == 0)
|
|
return consumed;
|
|
alternative += GET(alternative, 1);
|
|
@@ -3556,6 +3585,7 @@ int i, max, from;
|
|
int range_right = -1, range_len = 3 - 1;
|
|
sljit_ub *update_table = NULL;
|
|
BOOL in_range;
|
|
+pcre_uint32 rec_count;
|
|
|
|
for (i = 0; i < MAX_N_CHARS; i++)
|
|
{
|
|
@@ -3564,7 +3594,8 @@ for (i = 0; i < MAX_N_CHARS; i++)
|
|
bytes[i * MAX_N_BYTES] = 0;
|
|
}
|
|
|
|
-max = scan_prefix(common, common->start, chars, bytes, MAX_N_CHARS);
|
|
+rec_count = 10000;
|
|
+max = scan_prefix(common, common->start, chars, bytes, MAX_N_CHARS, &rec_count);
|
|
|
|
if (max <= 1)
|
|
return FALSE;
|
|
@@ -4311,8 +4342,10 @@ switch(length)
|
|
case 4:
|
|
if ((ranges[1] - ranges[0]) == (ranges[3] - ranges[2])
|
|
&& (ranges[0] | (ranges[2] - ranges[0])) == ranges[2]
|
|
+ && (ranges[1] & (ranges[2] - ranges[0])) == 0
|
|
&& is_powerof2(ranges[2] - ranges[0]))
|
|
{
|
|
+ SLJIT_ASSERT((ranges[0] & (ranges[2] - ranges[0])) == 0 && (ranges[2] & ranges[3] & (ranges[2] - ranges[0])) != 0);
|
|
OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, ranges[2] - ranges[0]);
|
|
if (ranges[2] + 1 != ranges[3])
|
|
{
|
|
@@ -4900,9 +4933,10 @@ else if ((cc[-1] & XCL_MAP) != 0)
|
|
if (!check_class_ranges(common, (const pcre_uint8 *)cc, FALSE, TRUE, list))
|
|
{
|
|
#ifdef COMPILE_PCRE8
|
|
- SLJIT_ASSERT(common->utf);
|
|
+ jump = NULL;
|
|
+ if (common->utf)
|
|
#endif
|
|
- jump = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, 255);
|
|
+ jump = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, 255);
|
|
|
|
OP2(SLJIT_AND, TMP2, 0, TMP1, 0, SLJIT_IMM, 0x7);
|
|
OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 3);
|
|
@@ -4911,7 +4945,10 @@ else if ((cc[-1] & XCL_MAP) != 0)
|
|
OP2(SLJIT_AND | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, TMP2, 0);
|
|
add_jump(compiler, list, JUMP(SLJIT_NOT_ZERO));
|
|
|
|
- JUMPHERE(jump);
|
|
+#ifdef COMPILE_PCRE8
|
|
+ if (common->utf)
|
|
+#endif
|
|
+ JUMPHERE(jump);
|
|
}
|
|
|
|
OP1(SLJIT_MOV, TMP1, 0, TMP3, 0);
|
|
@@ -5219,7 +5256,7 @@ while (*cc != XCL_END)
|
|
OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_UNUSED, 0, SLJIT_LESS_EQUAL);
|
|
|
|
SET_CHAR_OFFSET(0);
|
|
- OP2(SLJIT_SUB | SLJIT_SET_U, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0xff);
|
|
+ OP2(SLJIT_SUB | SLJIT_SET_U, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x7f);
|
|
OP_FLAGS(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_LESS_EQUAL);
|
|
|
|
SET_TYPE_OFFSET(ucp_Pc);
|
|
@@ -7665,6 +7702,10 @@ while (*cc != OP_KETRPOS)
|
|
OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), STR_PTR, 0);
|
|
}
|
|
|
|
+ /* Even if the match is empty, we need to reset the control head. */
|
|
+ if (needs_control_head)
|
|
+ OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->control_head_ptr, SLJIT_MEM1(STACK_TOP), STACK(stack));
|
|
+
|
|
if (opcode == OP_SBRAPOS || opcode == OP_SCBRAPOS)
|
|
add_jump(compiler, &emptymatch, CMP(SLJIT_EQUAL, TMP1, 0, STR_PTR, 0));
|
|
|
|
@@ -7692,6 +7733,10 @@ while (*cc != OP_KETRPOS)
|
|
OP1(SLJIT_MOV, SLJIT_MEM1(TMP2), (framesize + 1) * sizeof(sljit_sw), STR_PTR, 0);
|
|
}
|
|
|
|
+ /* Even if the match is empty, we need to reset the control head. */
|
|
+ if (needs_control_head)
|
|
+ OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->control_head_ptr, SLJIT_MEM1(STACK_TOP), STACK(stack));
|
|
+
|
|
if (opcode == OP_SBRAPOS || opcode == OP_SCBRAPOS)
|
|
add_jump(compiler, &emptymatch, CMP(SLJIT_EQUAL, TMP1, 0, STR_PTR, 0));
|
|
|
|
@@ -7704,9 +7749,6 @@ while (*cc != OP_KETRPOS)
|
|
}
|
|
}
|
|
|
|
- if (needs_control_head)
|
|
- OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->control_head_ptr, SLJIT_MEM1(STACK_TOP), STACK(stack));
|
|
-
|
|
JUMPTO(SLJIT_JUMP, loop);
|
|
flush_stubs(common);
|
|
|
|
@@ -8441,8 +8483,7 @@ while (cc < ccend)
|
|
OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(1), STR_PTR, 0);
|
|
}
|
|
BACKTRACK_AS(braminzero_backtrack)->matchingpath = LABEL();
|
|
- if (cc[1] > OP_ASSERTBACK_NOT)
|
|
- count_match(common);
|
|
+ count_match(common);
|
|
break;
|
|
|
|
case OP_ONCE:
|
|
@@ -9624,7 +9665,7 @@ static SLJIT_INLINE void compile_recurse(compiler_common *common)
|
|
DEFINE_COMPILER;
|
|
pcre_uchar *cc = common->start + common->currententry->start;
|
|
pcre_uchar *ccbegin = cc + 1 + LINK_SIZE + (*cc == OP_BRA ? 0 : IMM2_SIZE);
|
|
-pcre_uchar *ccend = bracketend(cc);
|
|
+pcre_uchar *ccend = bracketend(cc) - (1 + LINK_SIZE);
|
|
BOOL needs_control_head;
|
|
int framesize = get_framesize(common, cc, NULL, TRUE, &needs_control_head);
|
|
int private_data_size = get_private_data_copy_length(common, ccbegin, ccend, needs_control_head);
|
|
@@ -9648,6 +9689,7 @@ set_jumps(common->currententry->calls, common->currententry->entry);
|
|
|
|
sljit_emit_fast_enter(compiler, TMP2, 0);
|
|
allocate_stack(common, private_data_size + framesize + alternativesize);
|
|
+count_match(common);
|
|
OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(private_data_size + framesize + alternativesize - 1), TMP2, 0);
|
|
copy_private_data(common, ccbegin, ccend, TRUE, private_data_size + framesize + alternativesize, framesize + alternativesize, needs_control_head);
|
|
if (needs_control_head)
|
|
@@ -9992,6 +10034,7 @@ OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, stack));
|
|
OP1(SLJIT_MOV_UI, TMP1, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, limit_match));
|
|
OP1(SLJIT_MOV, STACK_TOP, 0, SLJIT_MEM1(TMP2), SLJIT_OFFSETOF(struct sljit_stack, base));
|
|
OP1(SLJIT_MOV, STACK_LIMIT, 0, SLJIT_MEM1(TMP2), SLJIT_OFFSETOF(struct sljit_stack, limit));
|
|
+OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 1);
|
|
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LIMIT_MATCH, TMP1, 0);
|
|
|
|
if (mode == JIT_PARTIAL_SOFT_COMPILE)
|
|
diff --git a/ext/pcre/pcrelib/pcre_study.c b/ext/pcre/pcrelib/pcre_study.c
|
|
index 998fe23..7fd0ba0 100644
|
|
--- a/ext/pcre/pcrelib/pcre_study.c
|
|
+++ b/ext/pcre/pcrelib/pcre_study.c
|
|
@@ -71,6 +71,7 @@ rather than bytes.
|
|
startcode pointer to start of the whole pattern's code
|
|
options the compiling options
|
|
recurses chain of recurse_check to catch mutual recursion
|
|
+ countptr pointer to call count (to catch over complexity)
|
|
|
|
Returns: the minimum length
|
|
-1 if \C in UTF-8 mode or (*ACCEPT) was encountered
|
|
@@ -80,7 +81,8 @@ Returns: the minimum length
|
|
|
|
static int
|
|
find_minlength(const REAL_PCRE *re, const pcre_uchar *code,
|
|
- const pcre_uchar *startcode, int options, recurse_check *recurses)
|
|
+ const pcre_uchar *startcode, int options, recurse_check *recurses,
|
|
+ int *countptr)
|
|
{
|
|
int length = -1;
|
|
/* PCRE_UTF16 has the same value as PCRE_UTF8. */
|
|
@@ -90,6 +92,8 @@ recurse_check this_recurse;
|
|
register int branchlength = 0;
|
|
register pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE;
|
|
|
|
+if ((*countptr)++ > 1000) return -1; /* too complex */
|
|
+
|
|
if (*code == OP_CBRA || *code == OP_SCBRA ||
|
|
*code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE;
|
|
|
|
@@ -131,7 +135,7 @@ for (;;)
|
|
case OP_SBRAPOS:
|
|
case OP_ONCE:
|
|
case OP_ONCE_NC:
|
|
- d = find_minlength(re, cc, startcode, options, recurses);
|
|
+ d = find_minlength(re, cc, startcode, options, recurses, countptr);
|
|
if (d < 0) return d;
|
|
branchlength += d;
|
|
do cc += GET(cc, 1); while (*cc == OP_ALT);
|
|
@@ -415,7 +419,8 @@ for (;;)
|
|
int dd;
|
|
this_recurse.prev = recurses;
|
|
this_recurse.group = cs;
|
|
- dd = find_minlength(re, cs, startcode, options, &this_recurse);
|
|
+ dd = find_minlength(re, cs, startcode, options, &this_recurse,
|
|
+ countptr);
|
|
if (dd < d) d = dd;
|
|
}
|
|
}
|
|
@@ -451,7 +456,8 @@ for (;;)
|
|
{
|
|
this_recurse.prev = recurses;
|
|
this_recurse.group = cs;
|
|
- d = find_minlength(re, cs, startcode, options, &this_recurse);
|
|
+ d = find_minlength(re, cs, startcode, options, &this_recurse,
|
|
+ countptr);
|
|
}
|
|
}
|
|
}
|
|
@@ -514,7 +520,7 @@ for (;;)
|
|
this_recurse.prev = recurses;
|
|
this_recurse.group = cs;
|
|
branchlength += find_minlength(re, cs, startcode, options,
|
|
- &this_recurse);
|
|
+ &this_recurse, countptr);
|
|
}
|
|
}
|
|
cc += 1 + LINK_SIZE;
|
|
@@ -1453,6 +1459,7 @@ pcre32_study(const pcre32 *external_re, int options, const char **errorptr)
|
|
#endif
|
|
{
|
|
int min;
|
|
+int count = 0;
|
|
BOOL bits_set = FALSE;
|
|
pcre_uint8 start_bits[32];
|
|
PUBL(extra) *extra = NULL;
|
|
@@ -1539,7 +1546,7 @@ if ((re->options & PCRE_ANCHORED) == 0 &&
|
|
|
|
/* Find the minimum length of subject string. */
|
|
|
|
-switch(min = find_minlength(re, code, code, re->options, NULL))
|
|
+switch(min = find_minlength(re, code, code, re->options, NULL, &count))
|
|
{
|
|
case -2: *errorptr = "internal error: missing capturing bracket"; return NULL;
|
|
case -3: *errorptr = "internal error: opcode not recognized"; return NULL;
|
|
diff --git a/ext/pcre/pcrelib/pcre_xclass.c b/ext/pcre/pcrelib/pcre_xclass.c
|
|
index c2b61f0..ef759a5 100644
|
|
--- a/ext/pcre/pcrelib/pcre_xclass.c
|
|
+++ b/ext/pcre/pcrelib/pcre_xclass.c
|
|
@@ -246,7 +246,7 @@ while ((t = *data++) != XCL_END)
|
|
|
|
case PT_PXPUNCT:
|
|
if ((PRIV(ucp_gentype)[prop->chartype] == ucp_P ||
|
|
- (c < 256 && PRIV(ucp_gentype)[prop->chartype] == ucp_S)) == isprop)
|
|
+ (c < 128 && PRIV(ucp_gentype)[prop->chartype] == ucp_S)) == isprop)
|
|
return !negated;
|
|
break;
|
|
|
|
diff --git a/ext/pcre/pcrelib/sljit/sljitConfig.h b/ext/pcre/pcrelib/sljit/sljitConfig.h
|
|
index 10364c3..1c8a521 100644
|
|
--- a/ext/pcre/pcrelib/sljit/sljitConfig.h
|
|
+++ b/ext/pcre/pcrelib/sljit/sljitConfig.h
|
|
@@ -96,6 +96,15 @@
|
|
#define SLJIT_EXECUTABLE_ALLOCATOR 1
|
|
#endif
|
|
|
|
+/* Force cdecl calling convention even if a better calling
|
|
+ convention (e.g. fastcall) is supported by the C compiler.
|
|
+ If this option is enabled, C functions without
|
|
+ SLJIT_CALL can also be called from JIT code. */
|
|
+#ifndef SLJIT_USE_CDECL_CALLING_CONVENTION
|
|
+/* Disabled by default */
|
|
+#define SLJIT_USE_CDECL_CALLING_CONVENTION 0
|
|
+#endif
|
|
+
|
|
/* Return with error when an invalid argument is passed. */
|
|
#ifndef SLJIT_ARGUMENT_CHECKS
|
|
/* Disabled by default */
|
|
diff --git a/ext/pcre/pcrelib/sljit/sljitConfigInternal.h b/ext/pcre/pcrelib/sljit/sljitConfigInternal.h
|
|
index 3284012..16e3547 100644
|
|
--- a/ext/pcre/pcrelib/sljit/sljitConfigInternal.h
|
|
+++ b/ext/pcre/pcrelib/sljit/sljitConfigInternal.h
|
|
@@ -468,7 +468,12 @@ typedef double sljit_d;
|
|
|
|
#ifndef SLJIT_CALL
|
|
|
|
-#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
|
|
+#if (defined SLJIT_USE_CDECL_CALLING_CONVENTION && SLJIT_USE_CDECL_CALLING_CONVENTION)
|
|
+
|
|
+/* Force cdecl. */
|
|
+#define SLJIT_CALL
|
|
+
|
|
+#elif (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
|
|
|
|
#if defined(__GNUC__) && !defined(__APPLE__)
|
|
|
|
@@ -608,6 +613,12 @@ SLJIT_API_FUNC_ATTRIBUTE void sljit_free_unused_memory_exec(void);
|
|
#define SLJIT_LOCALS_OFFSET_BASE ((23 + 1) * sizeof(sljit_sw))
|
|
#endif
|
|
|
|
+#elif (defined SLJIT_CONFIG_TILEGX && SLJIT_CONFIG_TILEGX)
|
|
+
|
|
+#define SLJIT_NUMBER_OF_REGISTERS 10
|
|
+#define SLJIT_NUMBER_OF_SAVED_REGISTERS 5
|
|
+#define SLJIT_LOCALS_OFFSET_BASE 0
|
|
+
|
|
#elif (defined SLJIT_CONFIG_UNSUPPORTED && SLJIT_CONFIG_UNSUPPORTED)
|
|
|
|
#define SLJIT_NUMBER_OF_REGISTERS 0
|
|
diff --git a/ext/pcre/pcrelib/sljit/sljitLir.c b/ext/pcre/pcrelib/sljit/sljitLir.c
|
|
index 5039a7e..0f1b1c9 100644
|
|
--- a/ext/pcre/pcrelib/sljit/sljitLir.c
|
|
+++ b/ext/pcre/pcrelib/sljit/sljitLir.c
|
|
@@ -845,8 +845,8 @@ SLJIT_API_FUNC_ATTRIBUTE void sljit_compiler_verbose(struct sljit_compiler *comp
|
|
}
|
|
|
|
static SLJIT_CONST char* op0_names[] = {
|
|
- (char*)"breakpoint", (char*)"nop",
|
|
- (char*)"lumul", (char*)"lsmul", (char*)"ludiv", (char*)"lsdiv",
|
|
+ (char*)"breakpoint", (char*)"nop", (char*)"lumul", (char*)"lsmul",
|
|
+ (char*)"udivmod", (char*)"sdivmod", (char*)"udivi", (char*)"sdivi"
|
|
};
|
|
|
|
static SLJIT_CONST char* op1_names[] = {
|
|
@@ -1036,7 +1036,7 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_op0(struct sljit_compiler
|
|
{
|
|
#if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
|
|
CHECK_ARGUMENT((op >= SLJIT_BREAKPOINT && op <= SLJIT_LSMUL)
|
|
- || ((op & ~SLJIT_INT_OP) >= SLJIT_LUDIV && (op & ~SLJIT_INT_OP) <= SLJIT_LSDIV));
|
|
+ || ((op & ~SLJIT_INT_OP) >= SLJIT_UDIVMOD && (op & ~SLJIT_INT_OP) <= SLJIT_SDIVI));
|
|
CHECK_ARGUMENT(op < SLJIT_LUMUL || compiler->scratches >= 2);
|
|
#endif
|
|
#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
|
|
@@ -1447,6 +1447,8 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_op_flags(struct sljit_com
|
|
|
|
static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_get_local_base(struct sljit_compiler *compiler, sljit_si dst, sljit_sw dstw, sljit_sw offset)
|
|
{
|
|
+ SLJIT_UNUSED_ARG(offset);
|
|
+
|
|
#if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
|
|
FUNCTION_CHECK_DST(dst, dstw);
|
|
#endif
|
|
@@ -1462,6 +1464,8 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_get_local_base(struct sljit_co
|
|
|
|
static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_const(struct sljit_compiler *compiler, sljit_si dst, sljit_sw dstw, sljit_sw init_value)
|
|
{
|
|
+ SLJIT_UNUSED_ARG(init_value);
|
|
+
|
|
#if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
|
|
FUNCTION_CHECK_DST(dst, dstw);
|
|
#endif
|
|
diff --git a/ext/pcre/pcrelib/sljit/sljitLir.h b/ext/pcre/pcrelib/sljit/sljitLir.h
|
|
index 24c0f60..2e2e9ac09 100644
|
|
--- a/ext/pcre/pcrelib/sljit/sljitLir.h
|
|
+++ b/ext/pcre/pcrelib/sljit/sljitLir.h
|
|
@@ -687,7 +687,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_fast_return(struct sljit_compiler *
|
|
#define SLJIT_OP0_BASE 0
|
|
|
|
/* Flags: - (never set any flags)
|
|
- Note: breakpoint instruction is not supported by all architectures (namely ppc)
|
|
+ Note: breakpoint instruction is not supported by all architectures (e.g. ppc)
|
|
It falls back to SLJIT_NOP in those cases. */
|
|
#define SLJIT_BREAKPOINT (SLJIT_OP0_BASE + 0)
|
|
/* Flags: - (never set any flags)
|
|
@@ -696,24 +696,42 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_fast_return(struct sljit_compiler *
|
|
#define SLJIT_NOP (SLJIT_OP0_BASE + 1)
|
|
/* Flags: - (may destroy flags)
|
|
Unsigned multiplication of SLJIT_R0 and SLJIT_R1.
|
|
- Result goes to SLJIT_R1:SLJIT_R0 (high:low) word */
|
|
+ Result is placed into SLJIT_R1:SLJIT_R0 (high:low) word */
|
|
#define SLJIT_LUMUL (SLJIT_OP0_BASE + 2)
|
|
/* Flags: - (may destroy flags)
|
|
Signed multiplication of SLJIT_R0 and SLJIT_R1.
|
|
- Result goes to SLJIT_R1:SLJIT_R0 (high:low) word */
|
|
+ Result is placed into SLJIT_R1:SLJIT_R0 (high:low) word */
|
|
#define SLJIT_LSMUL (SLJIT_OP0_BASE + 3)
|
|
/* Flags: I - (may destroy flags)
|
|
Unsigned divide of the value in SLJIT_R0 by the value in SLJIT_R1.
|
|
- The result is placed in SLJIT_R0 and the remainder goes to SLJIT_R1.
|
|
- Note: if SLJIT_R1 contains 0, the behaviour is undefined. */
|
|
-#define SLJIT_LUDIV (SLJIT_OP0_BASE + 4)
|
|
-#define SLJIT_ILUDIV (SLJIT_LUDIV | SLJIT_INT_OP)
|
|
+ The result is placed into SLJIT_R0 and the remainder into SLJIT_R1.
|
|
+ Note: if SLJIT_R1 is 0, the behaviour is undefined. */
|
|
+#define SLJIT_UDIVMOD (SLJIT_OP0_BASE + 4)
|
|
+#define SLJIT_IUDIVMOD (SLJIT_UDIVMOD | SLJIT_INT_OP)
|
|
/* Flags: I - (may destroy flags)
|
|
Signed divide of the value in SLJIT_R0 by the value in SLJIT_R1.
|
|
- The result is placed in SLJIT_R0 and the remainder goes to SLJIT_R1.
|
|
- Note: if SLJIT_R1 contains 0, the behaviour is undefined. */
|
|
-#define SLJIT_LSDIV (SLJIT_OP0_BASE + 5)
|
|
-#define SLJIT_ILSDIV (SLJIT_LSDIV | SLJIT_INT_OP)
|
|
+ The result is placed into SLJIT_R0 and the remainder into SLJIT_R1.
|
|
+ Note: if SLJIT_R1 is 0, the behaviour is undefined.
|
|
+ Note: if SLJIT_R1 is -1 and SLJIT_R0 is integer min (0x800..00),
|
|
+ the behaviour is undefined. */
|
|
+#define SLJIT_SDIVMOD (SLJIT_OP0_BASE + 5)
|
|
+#define SLJIT_ISDIVMOD (SLJIT_SDIVMOD | SLJIT_INT_OP)
|
|
+/* Flags: I - (may destroy flags)
|
|
+ Unsigned divide of the value in SLJIT_R0 by the value in SLJIT_R1.
|
|
+ The result is placed into SLJIT_R0. SLJIT_R1 preserves its value.
|
|
+ Note: if SLJIT_R1 is 0, the behaviour is undefined.
|
|
+ Note: SLJIT_SDIV is single precision divide. */
|
|
+#define SLJIT_UDIVI (SLJIT_OP0_BASE + 6)
|
|
+#define SLJIT_IUDIVI (SLJIT_UDIVI | SLJIT_INT_OP)
|
|
+/* Flags: I - (may destroy flags)
|
|
+ Signed divide of the value in SLJIT_R0 by the value in SLJIT_R1.
|
|
+ The result is placed into SLJIT_R0. SLJIT_R1 preserves its value.
|
|
+ Note: if SLJIT_R1 is 0, the behaviour is undefined.
|
|
+ Note: if SLJIT_R1 is -1 and SLJIT_R0 is integer min (0x800..00),
|
|
+ the behaviour is undefined.
|
|
+ Note: SLJIT_SDIV is single precision divide. */
|
|
+#define SLJIT_SDIVI (SLJIT_OP0_BASE + 7)
|
|
+#define SLJIT_ISDIVI (SLJIT_SDIVI | SLJIT_INT_OP)
|
|
|
|
SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler, sljit_si op);
|
|
|
|
@@ -851,34 +869,6 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op2(struct sljit_compiler *compiler
|
|
sljit_si src1, sljit_sw src1w,
|
|
sljit_si src2, sljit_sw src2w);
|
|
|
|
-/* The following function is a helper function for sljit_emit_op_custom.
|
|
- It returns with the real machine register index ( >=0 ) of any SLJIT_R,
|
|
- SLJIT_S and SLJIT_SP registers.
|
|
-
|
|
- Note: it returns with -1 for virtual registers (only on x86-32). */
|
|
-
|
|
-SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_get_register_index(sljit_si reg);
|
|
-
|
|
-/* The following function is a helper function for sljit_emit_op_custom.
|
|
- It returns with the real machine register index of any SLJIT_FLOAT register.
|
|
-
|
|
- Note: the index is always an even number on ARM (except ARM-64), MIPS, and SPARC. */
|
|
-
|
|
-SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_get_float_register_index(sljit_si reg);
|
|
-
|
|
-/* Any instruction can be inserted into the instruction stream by
|
|
- sljit_emit_op_custom. It has a similar purpose as inline assembly.
|
|
- The size parameter must match to the instruction size of the target
|
|
- architecture:
|
|
-
|
|
- x86: 0 < size <= 15. The instruction argument can be byte aligned.
|
|
- Thumb2: if size == 2, the instruction argument must be 2 byte aligned.
|
|
- if size == 4, the instruction argument must be 4 byte aligned.
|
|
- Otherwise: size must be 4 and instruction argument must be 4 byte aligned. */
|
|
-
|
|
-SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op_custom(struct sljit_compiler *compiler,
|
|
- void *instruction, sljit_si size);
|
|
-
|
|
/* Returns with non-zero if fpu is available. */
|
|
|
|
SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_is_fpu_available(void);
|
|
@@ -1196,4 +1186,64 @@ SLJIT_API_FUNC_ATTRIBUTE void sljit_set_function_context(void** func_ptr, struct
|
|
|
|
#endif /* !(defined SLJIT_INDIRECT_CALL && SLJIT_INDIRECT_CALL) */
|
|
|
|
+/* --------------------------------------------------------------------- */
|
|
+/* CPU specific functions */
|
|
+/* --------------------------------------------------------------------- */
|
|
+
|
|
+/* The following function is a helper function for sljit_emit_op_custom.
|
|
+ It returns with the real machine register index ( >=0 ) of any SLJIT_R,
|
|
+ SLJIT_S and SLJIT_SP registers.
|
|
+
|
|
+ Note: it returns with -1 for virtual registers (only on x86-32). */
|
|
+
|
|
+SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_get_register_index(sljit_si reg);
|
|
+
|
|
+/* The following function is a helper function for sljit_emit_op_custom.
|
|
+ It returns with the real machine register index of any SLJIT_FLOAT register.
|
|
+
|
|
+ Note: the index is always an even number on ARM (except ARM-64), MIPS, and SPARC. */
|
|
+
|
|
+SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_get_float_register_index(sljit_si reg);
|
|
+
|
|
+/* Any instruction can be inserted into the instruction stream by
|
|
+ sljit_emit_op_custom. It has a similar purpose as inline assembly.
|
|
+ The size parameter must match to the instruction size of the target
|
|
+ architecture:
|
|
+
|
|
+ x86: 0 < size <= 15. The instruction argument can be byte aligned.
|
|
+ Thumb2: if size == 2, the instruction argument must be 2 byte aligned.
|
|
+ if size == 4, the instruction argument must be 4 byte aligned.
|
|
+ Otherwise: size must be 4 and instruction argument must be 4 byte aligned. */
|
|
+
|
|
+SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op_custom(struct sljit_compiler *compiler,
|
|
+ void *instruction, sljit_si size);
|
|
+
|
|
+#if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86)
|
|
+
|
|
+/* Returns with non-zero if sse2 is available. */
|
|
+
|
|
+SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_x86_is_sse2_available(void);
|
|
+
|
|
+/* Returns with non-zero if cmov instruction is available. */
|
|
+
|
|
+SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_x86_is_cmov_available(void);
|
|
+
|
|
+/* Emit a conditional mov instruction on x86 CPUs. This instruction
|
|
+ moves src to destination, if the condition is satisfied. Unlike
|
|
+ other arithmetic instructions, destination must be a register.
|
|
+ Before such instructions are emitted, cmov support should be
|
|
+ checked by sljit_x86_is_cmov_available function.
|
|
+ type must be between SLJIT_EQUAL and SLJIT_S_ORDERED
|
|
+ dst_reg must be a valid register and it can be combined
|
|
+ with SLJIT_INT_OP to perform 32 bit arithmetic
|
|
+ Flags: I - (never set any flags)
|
|
+ */
|
|
+
|
|
+SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_x86_emit_cmov(struct sljit_compiler *compiler,
|
|
+ sljit_si type,
|
|
+ sljit_si dst_reg,
|
|
+ sljit_si src, sljit_sw srcw);
|
|
+
|
|
+#endif
|
|
+
|
|
#endif /* _SLJIT_LIR_H_ */
|
|
diff --git a/ext/pcre/pcrelib/sljit/sljitNativeARM_32.c b/ext/pcre/pcrelib/sljit/sljitNativeARM_32.c
|
|
index aca1d31..5cd4c71 100644
|
|
--- a/ext/pcre/pcrelib/sljit/sljitNativeARM_32.c
|
|
+++ b/ext/pcre/pcrelib/sljit/sljitNativeARM_32.c
|
|
@@ -1833,18 +1833,33 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler
|
|
| (reg_map[SLJIT_R0] << 8)
|
|
| reg_map[TMP_REG1]);
|
|
#endif
|
|
- case SLJIT_LUDIV:
|
|
- case SLJIT_LSDIV:
|
|
- if (compiler->scratches >= 3)
|
|
+ case SLJIT_UDIVMOD:
|
|
+ case SLJIT_SDIVMOD:
|
|
+ case SLJIT_UDIVI:
|
|
+ case SLJIT_SDIVI:
|
|
+ SLJIT_COMPILE_ASSERT((SLJIT_UDIVMOD & 0x2) == 0 && SLJIT_UDIVI - 0x2 == SLJIT_UDIVMOD, bad_div_opcode_assignments);
|
|
+ SLJIT_COMPILE_ASSERT(reg_map[2] == 1 && reg_map[3] == 2, bad_register_mapping);
|
|
+
|
|
+ if ((op >= SLJIT_UDIVI) && (compiler->scratches >= 3)) {
|
|
FAIL_IF(push_inst(compiler, 0xe52d2008 /* str r2, [sp, #-8]! */));
|
|
+ FAIL_IF(push_inst(compiler, 0xe58d1004 /* str r1, [sp, #4] */));
|
|
+ }
|
|
+ else if ((op >= SLJIT_UDIVI) || (compiler->scratches >= 3))
|
|
+ FAIL_IF(push_inst(compiler, 0xe52d0008 | (op >= SLJIT_UDIVI ? 0x1000 : 0x2000) /* str r1/r2, [sp, #-8]! */));
|
|
+
|
|
#if defined(__GNUC__)
|
|
FAIL_IF(sljit_emit_ijump(compiler, SLJIT_FAST_CALL, SLJIT_IMM,
|
|
- (op == SLJIT_LUDIV ? SLJIT_FUNC_OFFSET(__aeabi_uidivmod) : SLJIT_FUNC_OFFSET(__aeabi_idivmod))));
|
|
+ ((op | 0x2) == SLJIT_UDIVI ? SLJIT_FUNC_OFFSET(__aeabi_uidivmod) : SLJIT_FUNC_OFFSET(__aeabi_idivmod))));
|
|
#else
|
|
#error "Software divmod functions are needed"
|
|
#endif
|
|
- if (compiler->scratches >= 3)
|
|
- return push_inst(compiler, 0xe49d2008 /* ldr r2, [sp], #8 */);
|
|
+
|
|
+ if ((op >= SLJIT_UDIVI) && (compiler->scratches >= 3)) {
|
|
+ FAIL_IF(push_inst(compiler, 0xe59d1004 /* ldr r1, [sp, #4] */));
|
|
+ FAIL_IF(push_inst(compiler, 0xe49d2008 /* ldr r2, [sp], #8 */));
|
|
+ }
|
|
+ else if ((op >= SLJIT_UDIVI) || (compiler->scratches >= 3))
|
|
+ return push_inst(compiler, 0xe49d0008 | (op >= SLJIT_UDIVI ? 0x1000 : 0x2000) /* ldr r1/r2, [sp], #8 */);
|
|
return SLJIT_SUCCESS;
|
|
}
|
|
|
|
diff --git a/ext/pcre/pcrelib/sljit/sljitNativeARM_64.c b/ext/pcre/pcrelib/sljit/sljitNativeARM_64.c
|
|
index b66455f..044a675 100644
|
|
--- a/ext/pcre/pcrelib/sljit/sljitNativeARM_64.c
|
|
+++ b/ext/pcre/pcrelib/sljit/sljitNativeARM_64.c
|
|
@@ -1087,14 +1087,20 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_enter(struct sljit_compiler *compil
|
|
saved_regs_size += sizeof(sljit_sw);
|
|
}
|
|
local_size -= saved_regs_size + SLJIT_LOCALS_OFFSET;
|
|
- FAIL_IF(push_inst(compiler, SUBI | RD(TMP_SP) | RN(TMP_SP) | (saved_regs_size << 10)));
|
|
+ if (saved_regs_size > 0)
|
|
+ FAIL_IF(push_inst(compiler, SUBI | RD(TMP_SP) | RN(TMP_SP) | (saved_regs_size << 10)));
|
|
}
|
|
|
|
tmp = saveds < SLJIT_NUMBER_OF_SAVED_REGISTERS ? (SLJIT_S0 + 1 - saveds) : SLJIT_FIRST_SAVED_REG;
|
|
prev = -1;
|
|
for (i = SLJIT_S0; i >= tmp; i--) {
|
|
if (prev == -1) {
|
|
- prev = i;
|
|
+ if (!(offs & (1 << 15))) {
|
|
+ prev = i;
|
|
+ continue;
|
|
+ }
|
|
+ FAIL_IF(push_inst(compiler, STRI | RT(i) | RN(TMP_SP) | (offs >> 5)));
|
|
+ offs += 1 << 15;
|
|
continue;
|
|
}
|
|
FAIL_IF(push_inst(compiler, STP | RT(prev) | RT2(i) | RN(TMP_SP) | offs));
|
|
@@ -1104,7 +1110,12 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_enter(struct sljit_compiler *compil
|
|
|
|
for (i = scratches; i >= SLJIT_FIRST_SAVED_REG; i--) {
|
|
if (prev == -1) {
|
|
- prev = i;
|
|
+ if (!(offs & (1 << 15))) {
|
|
+ prev = i;
|
|
+ continue;
|
|
+ }
|
|
+ FAIL_IF(push_inst(compiler, STRI | RT(i) | RN(TMP_SP) | (offs >> 5)));
|
|
+ offs += 1 << 15;
|
|
continue;
|
|
}
|
|
FAIL_IF(push_inst(compiler, STP | RT(prev) | RT2(i) | RN(TMP_SP) | offs));
|
|
@@ -1112,8 +1123,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_enter(struct sljit_compiler *compil
|
|
prev = -1;
|
|
}
|
|
|
|
- if (prev != -1)
|
|
- FAIL_IF(push_inst(compiler, STRI | RT(prev) | RN(TMP_SP) | (offs >> 5)));
|
|
+ SLJIT_ASSERT(prev == -1);
|
|
|
|
if (compiler->local_size > (63 * sizeof(sljit_sw))) {
|
|
/* The local_size is already adjusted by the saved registers. */
|
|
@@ -1188,7 +1198,12 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_return(struct sljit_compiler *compi
|
|
prev = -1;
|
|
for (i = SLJIT_S0; i >= tmp; i--) {
|
|
if (prev == -1) {
|
|
- prev = i;
|
|
+ if (!(offs & (1 << 15))) {
|
|
+ prev = i;
|
|
+ continue;
|
|
+ }
|
|
+ FAIL_IF(push_inst(compiler, LDRI | RT(i) | RN(TMP_SP) | (offs >> 5)));
|
|
+ offs += 1 << 15;
|
|
continue;
|
|
}
|
|
FAIL_IF(push_inst(compiler, LDP | RT(prev) | RT2(i) | RN(TMP_SP) | offs));
|
|
@@ -1198,7 +1213,12 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_return(struct sljit_compiler *compi
|
|
|
|
for (i = compiler->scratches; i >= SLJIT_FIRST_SAVED_REG; i--) {
|
|
if (prev == -1) {
|
|
- prev = i;
|
|
+ if (!(offs & (1 << 15))) {
|
|
+ prev = i;
|
|
+ continue;
|
|
+ }
|
|
+ FAIL_IF(push_inst(compiler, LDRI | RT(i) | RN(TMP_SP) | (offs >> 5)));
|
|
+ offs += 1 << 15;
|
|
continue;
|
|
}
|
|
FAIL_IF(push_inst(compiler, LDP | RT(prev) | RT2(i) | RN(TMP_SP) | offs));
|
|
@@ -1206,13 +1226,12 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_return(struct sljit_compiler *compi
|
|
prev = -1;
|
|
}
|
|
|
|
- if (prev != -1)
|
|
- FAIL_IF(push_inst(compiler, LDRI | RT(prev) | RN(TMP_SP) | (offs >> 5)));
|
|
+ SLJIT_ASSERT(prev == -1);
|
|
|
|
if (compiler->local_size <= (63 * sizeof(sljit_sw))) {
|
|
FAIL_IF(push_inst(compiler, LDP_PST | 29 | RT2(TMP_LR)
|
|
| RN(TMP_SP) | (((local_size >> 3) & 0x7f) << 15)));
|
|
- } else {
|
|
+ } else if (saved_regs_size > 0) {
|
|
FAIL_IF(push_inst(compiler, ADDI | RD(TMP_SP) | RN(TMP_SP) | (saved_regs_size << 10)));
|
|
}
|
|
|
|
@@ -1242,12 +1261,15 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler
|
|
FAIL_IF(push_inst(compiler, ORR | RD(TMP_REG1) | RN(TMP_ZERO) | RM(SLJIT_R0)));
|
|
FAIL_IF(push_inst(compiler, MADD | RD(SLJIT_R0) | RN(SLJIT_R0) | RM(SLJIT_R1) | RT2(TMP_ZERO)));
|
|
return push_inst(compiler, (op == SLJIT_LUMUL ? UMULH : SMULH) | RD(SLJIT_R1) | RN(TMP_REG1) | RM(SLJIT_R1));
|
|
- case SLJIT_LUDIV:
|
|
- case SLJIT_LSDIV:
|
|
+ case SLJIT_UDIVMOD:
|
|
+ case SLJIT_SDIVMOD:
|
|
FAIL_IF(push_inst(compiler, (ORR ^ inv_bits) | RD(TMP_REG1) | RN(TMP_ZERO) | RM(SLJIT_R0)));
|
|
- FAIL_IF(push_inst(compiler, ((op == SLJIT_LUDIV ? UDIV : SDIV) ^ inv_bits) | RD(SLJIT_R0) | RN(SLJIT_R0) | RM(SLJIT_R1)));
|
|
+ FAIL_IF(push_inst(compiler, ((op == SLJIT_UDIVMOD ? UDIV : SDIV) ^ inv_bits) | RD(SLJIT_R0) | RN(SLJIT_R0) | RM(SLJIT_R1)));
|
|
FAIL_IF(push_inst(compiler, (MADD ^ inv_bits) | RD(SLJIT_R1) | RN(SLJIT_R0) | RM(SLJIT_R1) | RT2(TMP_ZERO)));
|
|
return push_inst(compiler, (SUB ^ inv_bits) | RD(SLJIT_R1) | RN(TMP_REG1) | RM(SLJIT_R1));
|
|
+ case SLJIT_UDIVI:
|
|
+ case SLJIT_SDIVI:
|
|
+ return push_inst(compiler, ((op == SLJIT_UDIVI ? UDIV : SDIV) ^ inv_bits) | RD(SLJIT_R0) | RN(SLJIT_R0) | RM(SLJIT_R1));
|
|
}
|
|
|
|
return SLJIT_SUCCESS;
|
|
diff --git a/ext/pcre/pcrelib/sljit/sljitNativeARM_T2_32.c b/ext/pcre/pcrelib/sljit/sljitNativeARM_T2_32.c
|
|
index 6e38cec..f9803f5 100644
|
|
--- a/ext/pcre/pcrelib/sljit/sljitNativeARM_T2_32.c
|
|
+++ b/ext/pcre/pcrelib/sljit/sljitNativeARM_T2_32.c
|
|
@@ -1239,6 +1239,9 @@ extern int __aeabi_idivmod(int numerator, int denominator);
|
|
|
|
SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler, sljit_si op)
|
|
{
|
|
+ sljit_sw saved_reg_list[3];
|
|
+ sljit_sw saved_reg_count;
|
|
+
|
|
CHECK_ERROR();
|
|
CHECK(check_sljit_emit_op0(compiler, op));
|
|
|
|
@@ -1255,24 +1258,53 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler
|
|
| (reg_map[SLJIT_R0] << 12)
|
|
| (reg_map[SLJIT_R0] << 16)
|
|
| reg_map[SLJIT_R1]);
|
|
- case SLJIT_LUDIV:
|
|
- case SLJIT_LSDIV:
|
|
- if (compiler->scratches >= 4) {
|
|
- FAIL_IF(push_inst32(compiler, 0xf84d2d04 /* str r2, [sp, #-4]! */));
|
|
- FAIL_IF(push_inst32(compiler, 0xf84dcd04 /* str ip, [sp, #-4]! */));
|
|
- } else if (compiler->scratches >= 3)
|
|
- FAIL_IF(push_inst32(compiler, 0xf84d2d08 /* str r2, [sp, #-8]! */));
|
|
+ case SLJIT_UDIVMOD:
|
|
+ case SLJIT_SDIVMOD:
|
|
+ case SLJIT_UDIVI:
|
|
+ case SLJIT_SDIVI:
|
|
+ SLJIT_COMPILE_ASSERT((SLJIT_UDIVMOD & 0x2) == 0 && SLJIT_UDIVI - 0x2 == SLJIT_UDIVMOD, bad_div_opcode_assignments);
|
|
+ SLJIT_COMPILE_ASSERT(reg_map[2] == 1 && reg_map[3] == 2 && reg_map[4] == 12, bad_register_mapping);
|
|
+
|
|
+ saved_reg_count = 0;
|
|
+ if (compiler->scratches >= 4)
|
|
+ saved_reg_list[saved_reg_count++] = 12;
|
|
+ if (compiler->scratches >= 3)
|
|
+ saved_reg_list[saved_reg_count++] = 2;
|
|
+ if (op >= SLJIT_UDIVI)
|
|
+ saved_reg_list[saved_reg_count++] = 1;
|
|
+
|
|
+ if (saved_reg_count > 0) {
|
|
+ FAIL_IF(push_inst32(compiler, 0xf84d0d00 | (saved_reg_count >= 3 ? 16 : 8)
|
|
+ | (saved_reg_list[0] << 12) /* str rX, [sp, #-8/-16]! */));
|
|
+ if (saved_reg_count >= 2) {
|
|
+ SLJIT_ASSERT(saved_reg_list[1] < 8);
|
|
+ FAIL_IF(push_inst16(compiler, 0x9001 | (saved_reg_list[1] << 8) /* str rX, [sp, #4] */));
|
|
+ }
|
|
+ if (saved_reg_count >= 3) {
|
|
+ SLJIT_ASSERT(saved_reg_list[2] < 8);
|
|
+ FAIL_IF(push_inst16(compiler, 0x9002 | (saved_reg_list[2] << 8) /* str rX, [sp, #8] */));
|
|
+ }
|
|
+ }
|
|
+
|
|
#if defined(__GNUC__)
|
|
FAIL_IF(sljit_emit_ijump(compiler, SLJIT_FAST_CALL, SLJIT_IMM,
|
|
- (op == SLJIT_LUDIV ? SLJIT_FUNC_OFFSET(__aeabi_uidivmod) : SLJIT_FUNC_OFFSET(__aeabi_idivmod))));
|
|
+ ((op | 0x2) == SLJIT_UDIVI ? SLJIT_FUNC_OFFSET(__aeabi_uidivmod) : SLJIT_FUNC_OFFSET(__aeabi_idivmod))));
|
|
#else
|
|
#error "Software divmod functions are needed"
|
|
#endif
|
|
- if (compiler->scratches >= 4) {
|
|
- FAIL_IF(push_inst32(compiler, 0xf85dcb04 /* ldr ip, [sp], #4 */));
|
|
- return push_inst32(compiler, 0xf85d2b04 /* ldr r2, [sp], #4 */);
|
|
- } else if (compiler->scratches >= 3)
|
|
- return push_inst32(compiler, 0xf85d2b08 /* ldr r2, [sp], #8 */);
|
|
+
|
|
+ if (saved_reg_count > 0) {
|
|
+ if (saved_reg_count >= 3) {
|
|
+ SLJIT_ASSERT(saved_reg_list[2] < 8);
|
|
+ FAIL_IF(push_inst16(compiler, 0x9802 | (saved_reg_list[2] << 8) /* ldr rX, [sp, #8] */));
|
|
+ }
|
|
+ if (saved_reg_count >= 2) {
|
|
+ SLJIT_ASSERT(saved_reg_list[1] < 8);
|
|
+ FAIL_IF(push_inst16(compiler, 0x9801 | (saved_reg_list[1] << 8) /* ldr rX, [sp, #4] */));
|
|
+ }
|
|
+ return push_inst32(compiler, 0xf85d0b00 | (saved_reg_count >= 3 ? 16 : 8)
|
|
+ | (saved_reg_list[0] << 12) /* ldr rX, [sp], #8/16 */);
|
|
+ }
|
|
return SLJIT_SUCCESS;
|
|
}
|
|
|
|
diff --git a/ext/pcre/pcrelib/sljit/sljitNativeMIPS_common.c b/ext/pcre/pcrelib/sljit/sljitNativeMIPS_common.c
|
|
index 3e2c9f0..cf3535f 100644
|
|
--- a/ext/pcre/pcrelib/sljit/sljitNativeMIPS_common.c
|
|
+++ b/ext/pcre/pcrelib/sljit/sljitNativeMIPS_common.c
|
|
@@ -1053,8 +1053,11 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler
|
|
#endif
|
|
FAIL_IF(push_inst(compiler, MFLO | D(SLJIT_R0), DR(SLJIT_R0)));
|
|
return push_inst(compiler, MFHI | D(SLJIT_R1), DR(SLJIT_R1));
|
|
- case SLJIT_LUDIV:
|
|
- case SLJIT_LSDIV:
|
|
+ case SLJIT_UDIVMOD:
|
|
+ case SLJIT_SDIVMOD:
|
|
+ case SLJIT_UDIVI:
|
|
+ case SLJIT_SDIVI:
|
|
+ SLJIT_COMPILE_ASSERT((SLJIT_UDIVMOD & 0x2) == 0 && SLJIT_UDIVI - 0x2 == SLJIT_UDIVMOD, bad_div_opcode_assignments);
|
|
#if !(defined SLJIT_MIPS_R1 && SLJIT_MIPS_R1)
|
|
FAIL_IF(push_inst(compiler, NOP, UNMOVABLE_INS));
|
|
FAIL_IF(push_inst(compiler, NOP, UNMOVABLE_INS));
|
|
@@ -1062,15 +1065,15 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler
|
|
|
|
#if (defined SLJIT_CONFIG_MIPS_64 && SLJIT_CONFIG_MIPS_64)
|
|
if (int_op)
|
|
- FAIL_IF(push_inst(compiler, (op == SLJIT_LUDIV ? DIVU : DIV) | S(SLJIT_R0) | T(SLJIT_R1), MOVABLE_INS));
|
|
+ FAIL_IF(push_inst(compiler, ((op | 0x2) == SLJIT_UDIVI ? DIVU : DIV) | S(SLJIT_R0) | T(SLJIT_R1), MOVABLE_INS));
|
|
else
|
|
- FAIL_IF(push_inst(compiler, (op == SLJIT_LUDIV ? DDIVU : DDIV) | S(SLJIT_R0) | T(SLJIT_R1), MOVABLE_INS));
|
|
+ FAIL_IF(push_inst(compiler, ((op | 0x2) == SLJIT_UDIVI ? DDIVU : DDIV) | S(SLJIT_R0) | T(SLJIT_R1), MOVABLE_INS));
|
|
#else
|
|
- FAIL_IF(push_inst(compiler, (op == SLJIT_LUDIV ? DIVU : DIV) | S(SLJIT_R0) | T(SLJIT_R1), MOVABLE_INS));
|
|
+ FAIL_IF(push_inst(compiler, ((op | 0x2) == SLJIT_UDIVI ? DIVU : DIV) | S(SLJIT_R0) | T(SLJIT_R1), MOVABLE_INS));
|
|
#endif
|
|
|
|
FAIL_IF(push_inst(compiler, MFLO | D(SLJIT_R0), DR(SLJIT_R0)));
|
|
- return push_inst(compiler, MFHI | D(SLJIT_R1), DR(SLJIT_R1));
|
|
+ return (op >= SLJIT_UDIVI) ? SLJIT_SUCCESS : push_inst(compiler, MFHI | D(SLJIT_R1), DR(SLJIT_R1));
|
|
}
|
|
|
|
return SLJIT_SUCCESS;
|
|
diff --git a/ext/pcre/pcrelib/sljit/sljitNativePPC_common.c b/ext/pcre/pcrelib/sljit/sljitNativePPC_common.c
|
|
index 08d5356..b6a043f 100644
|
|
--- a/ext/pcre/pcrelib/sljit/sljitNativePPC_common.c
|
|
+++ b/ext/pcre/pcrelib/sljit/sljitNativePPC_common.c
|
|
@@ -1267,22 +1267,23 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler
|
|
FAIL_IF(push_inst(compiler, MULLW | D(SLJIT_R0) | A(TMP_REG1) | B(SLJIT_R1)));
|
|
return push_inst(compiler, (op == SLJIT_LUMUL ? MULHWU : MULHW) | D(SLJIT_R1) | A(TMP_REG1) | B(SLJIT_R1));
|
|
#endif
|
|
- case SLJIT_LUDIV:
|
|
- case SLJIT_LSDIV:
|
|
+ case SLJIT_UDIVMOD:
|
|
+ case SLJIT_SDIVMOD:
|
|
FAIL_IF(push_inst(compiler, OR | S(SLJIT_R0) | A(TMP_REG1) | B(SLJIT_R0)));
|
|
#if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
|
|
- if (int_op) {
|
|
- FAIL_IF(push_inst(compiler, (op == SLJIT_LUDIV ? DIVWU : DIVW) | D(SLJIT_R0) | A(TMP_REG1) | B(SLJIT_R1)));
|
|
- FAIL_IF(push_inst(compiler, MULLW | D(SLJIT_R1) | A(SLJIT_R0) | B(SLJIT_R1)));
|
|
- } else {
|
|
- FAIL_IF(push_inst(compiler, (op == SLJIT_LUDIV ? DIVDU : DIVD) | D(SLJIT_R0) | A(TMP_REG1) | B(SLJIT_R1)));
|
|
- FAIL_IF(push_inst(compiler, MULLD | D(SLJIT_R1) | A(SLJIT_R0) | B(SLJIT_R1)));
|
|
- }
|
|
- return push_inst(compiler, SUBF | D(SLJIT_R1) | A(SLJIT_R1) | B(TMP_REG1));
|
|
+ FAIL_IF(push_inst(compiler, (int_op ? (op == SLJIT_UDIVMOD ? DIVWU : DIVW) : (op == SLJIT_UDIVMOD ? DIVDU : DIVD)) | D(SLJIT_R0) | A(SLJIT_R0) | B(SLJIT_R1)));
|
|
+ FAIL_IF(push_inst(compiler, (int_op ? MULLW : MULLD) | D(SLJIT_R1) | A(SLJIT_R0) | B(SLJIT_R1)));
|
|
#else
|
|
- FAIL_IF(push_inst(compiler, (op == SLJIT_LUDIV ? DIVWU : DIVW) | D(SLJIT_R0) | A(TMP_REG1) | B(SLJIT_R1)));
|
|
+ FAIL_IF(push_inst(compiler, (op == SLJIT_UDIVMOD ? DIVWU : DIVW) | D(SLJIT_R0) | A(SLJIT_R0) | B(SLJIT_R1)));
|
|
FAIL_IF(push_inst(compiler, MULLW | D(SLJIT_R1) | A(SLJIT_R0) | B(SLJIT_R1)));
|
|
+#endif
|
|
return push_inst(compiler, SUBF | D(SLJIT_R1) | A(SLJIT_R1) | B(TMP_REG1));
|
|
+ case SLJIT_UDIVI:
|
|
+ case SLJIT_SDIVI:
|
|
+#if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
|
|
+ return push_inst(compiler, (int_op ? (op == SLJIT_UDIVI ? DIVWU : DIVW) : (op == SLJIT_UDIVI ? DIVDU : DIVD)) | D(SLJIT_R0) | A(SLJIT_R0) | B(SLJIT_R1));
|
|
+#else
|
|
+ return push_inst(compiler, (op == SLJIT_UDIVI ? DIVWU : DIVW) | D(SLJIT_R0) | A(SLJIT_R0) | B(SLJIT_R1));
|
|
#endif
|
|
}
|
|
|
|
diff --git a/ext/pcre/pcrelib/sljit/sljitNativeSPARC_common.c b/ext/pcre/pcrelib/sljit/sljitNativeSPARC_common.c
|
|
index 0b1927a..327c426 100644
|
|
--- a/ext/pcre/pcrelib/sljit/sljitNativeSPARC_common.c
|
|
+++ b/ext/pcre/pcrelib/sljit/sljitNativeSPARC_common.c
|
|
@@ -777,20 +777,25 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler
|
|
#else
|
|
#error "Implementation required"
|
|
#endif
|
|
- case SLJIT_LUDIV:
|
|
- case SLJIT_LSDIV:
|
|
+ case SLJIT_UDIVMOD:
|
|
+ case SLJIT_SDIVMOD:
|
|
+ case SLJIT_UDIVI:
|
|
+ case SLJIT_SDIVI:
|
|
+ SLJIT_COMPILE_ASSERT((SLJIT_UDIVMOD & 0x2) == 0 && SLJIT_UDIVI - 0x2 == SLJIT_UDIVMOD, bad_div_opcode_assignments);
|
|
#if (defined SLJIT_CONFIG_SPARC_32 && SLJIT_CONFIG_SPARC_32)
|
|
- if (op == SLJIT_LUDIV)
|
|
+ if ((op | 0x2) == SLJIT_UDIVI)
|
|
FAIL_IF(push_inst(compiler, WRY | S1(0), MOVABLE_INS));
|
|
else {
|
|
FAIL_IF(push_inst(compiler, SRA | D(TMP_REG1) | S1(SLJIT_R0) | IMM(31), DR(TMP_REG1)));
|
|
FAIL_IF(push_inst(compiler, WRY | S1(TMP_REG1), MOVABLE_INS));
|
|
}
|
|
- FAIL_IF(push_inst(compiler, OR | D(TMP_REG2) | S1(0) | S2(SLJIT_R0), DR(TMP_REG2)));
|
|
- FAIL_IF(push_inst(compiler, (op == SLJIT_LUDIV ? UDIV : SDIV) | D(SLJIT_R0) | S1(SLJIT_R0) | S2(SLJIT_R1), DR(SLJIT_R0)));
|
|
+ if (op <= SLJIT_SDIVMOD)
|
|
+ FAIL_IF(push_inst(compiler, OR | D(TMP_REG2) | S1(0) | S2(SLJIT_R0), DR(TMP_REG2)));
|
|
+ FAIL_IF(push_inst(compiler, ((op | 0x2) == SLJIT_UDIVI ? UDIV : SDIV) | D(SLJIT_R0) | S1(SLJIT_R0) | S2(SLJIT_R1), DR(SLJIT_R0)));
|
|
+ if (op >= SLJIT_UDIVI)
|
|
+ return SLJIT_SUCCESS;
|
|
FAIL_IF(push_inst(compiler, SMUL | D(SLJIT_R1) | S1(SLJIT_R0) | S2(SLJIT_R1), DR(SLJIT_R1)));
|
|
- FAIL_IF(push_inst(compiler, SUB | D(SLJIT_R1) | S1(TMP_REG2) | S2(SLJIT_R1), DR(SLJIT_R1)));
|
|
- return SLJIT_SUCCESS;
|
|
+ return push_inst(compiler, SUB | D(SLJIT_R1) | S1(TMP_REG2) | S2(SLJIT_R1), DR(SLJIT_R1));
|
|
#else
|
|
#error "Implementation required"
|
|
#endif
|
|
diff --git a/ext/pcre/pcrelib/sljit/sljitNativeTILEGX_64.c b/ext/pcre/pcrelib/sljit/sljitNativeTILEGX_64.c
|
|
index 1d6aa5a..4d40392f 100644
|
|
--- a/ext/pcre/pcrelib/sljit/sljitNativeTILEGX_64.c
|
|
+++ b/ext/pcre/pcrelib/sljit/sljitNativeTILEGX_64.c
|
|
@@ -35,21 +35,21 @@
|
|
#define SIMM_16BIT_MIN (-0x8000)
|
|
#define SIMM_17BIT_MAX (0xffff)
|
|
#define SIMM_17BIT_MIN (-0x10000)
|
|
-#define SIMM_32BIT_MIN (-0x80000000)
|
|
#define SIMM_32BIT_MAX (0x7fffffff)
|
|
-#define SIMM_48BIT_MIN (0x800000000000L)
|
|
+#define SIMM_32BIT_MIN (-0x7fffffff - 1)
|
|
#define SIMM_48BIT_MAX (0x7fffffff0000L)
|
|
+#define SIMM_48BIT_MIN (-0x800000000000L)
|
|
#define IMM16(imm) ((imm) & 0xffff)
|
|
|
|
#define UIMM_16BIT_MAX (0xffff)
|
|
|
|
-#define TMP_REG1 (SLJIT_NO_REGISTERS + 1)
|
|
-#define TMP_REG2 (SLJIT_NO_REGISTERS + 2)
|
|
-#define TMP_REG3 (SLJIT_NO_REGISTERS + 3)
|
|
-#define ADDR_TMP (SLJIT_NO_REGISTERS + 4)
|
|
+#define TMP_REG1 (SLJIT_NUMBER_OF_REGISTERS + 2)
|
|
+#define TMP_REG2 (SLJIT_NUMBER_OF_REGISTERS + 3)
|
|
+#define TMP_REG3 (SLJIT_NUMBER_OF_REGISTERS + 4)
|
|
+#define ADDR_TMP (SLJIT_NUMBER_OF_REGISTERS + 5)
|
|
#define PIC_ADDR_REG TMP_REG2
|
|
|
|
-static SLJIT_CONST sljit_ub reg_map[SLJIT_NO_REGISTERS + 5] = {
|
|
+static SLJIT_CONST sljit_ub reg_map[SLJIT_NUMBER_OF_REGISTERS + 6] = {
|
|
63, 0, 1, 2, 3, 4, 30, 31, 32, 33, 34, 54, 5, 16, 6, 7
|
|
};
|
|
|
|
@@ -58,11 +58,6 @@ static SLJIT_CONST sljit_ub reg_map[SLJIT_NO_REGISTERS + 5] = {
|
|
#define TMP_REG2_mapped 16
|
|
#define TMP_REG3_mapped 6
|
|
#define ADDR_TMP_mapped 7
|
|
-#define SLJIT_SAVED_REG1_mapped 30
|
|
-#define SLJIT_SAVED_REG2_mapped 31
|
|
-#define SLJIT_SAVED_REG3_mapped 32
|
|
-#define SLJIT_SAVED_EREG1_mapped 33
|
|
-#define SLJIT_SAVED_EREG2_mapped 34
|
|
|
|
/* Flags are keept in volatile registers. */
|
|
#define EQUAL_FLAG 8
|
|
@@ -399,6 +394,9 @@ static sljit_si push_inst(struct sljit_compiler *compiler, sljit_ins ins)
|
|
#define SUB(dst, srca, srcb) \
|
|
push_3_buffer(compiler, TILEGX_OPC_SUB, dst, srca, srcb, __LINE__)
|
|
|
|
+#define MUL(dst, srca, srcb) \
|
|
+ push_3_buffer(compiler, TILEGX_OPC_MULX, dst, srca, srcb, __LINE__)
|
|
+
|
|
#define NOR(dst, srca, srcb) \
|
|
push_3_buffer(compiler, TILEGX_OPC_NOR, dst, srca, srcb, __LINE__)
|
|
|
|
@@ -547,8 +545,8 @@ const struct Format* compute_format()
|
|
|
|
const struct Format* match = NULL;
|
|
const struct Format *b = NULL;
|
|
- unsigned int i = 0;
|
|
- for (i; i < sizeof formats / sizeof formats[0]; i++) {
|
|
+ unsigned int i;
|
|
+ for (i = 0; i < sizeof formats / sizeof formats[0]; i++) {
|
|
b = &formats[i];
|
|
if ((b->pipe_mask & compatible_pipes) == b->pipe_mask) {
|
|
match = b;
|
|
@@ -625,7 +623,6 @@ tilegx_bundle_bits get_bundle_bit(struct jit_instr *inst)
|
|
|
|
static sljit_si update_buffer(struct sljit_compiler *compiler)
|
|
{
|
|
- int count;
|
|
int i;
|
|
int orig_index = inst_buf_index;
|
|
struct jit_instr inst0 = inst_buf[0];
|
|
@@ -738,8 +735,10 @@ static sljit_si update_buffer(struct sljit_compiler *compiler)
|
|
|
|
static sljit_si flush_buffer(struct sljit_compiler *compiler)
|
|
{
|
|
- while (inst_buf_index != 0)
|
|
- update_buffer(compiler);
|
|
+ while (inst_buf_index != 0) {
|
|
+ FAIL_IF(update_buffer(compiler));
|
|
+ }
|
|
+ return SLJIT_SUCCESS;
|
|
}
|
|
|
|
static sljit_si push_4_buffer(struct sljit_compiler *compiler, tilegx_mnemonic opc, int op0, int op1, int op2, int op3, int line)
|
|
@@ -787,6 +786,7 @@ static sljit_si push_3_buffer(struct sljit_compiler *compiler, tilegx_mnemonic o
|
|
case TILEGX_OPC_ADD:
|
|
case TILEGX_OPC_AND:
|
|
case TILEGX_OPC_SUB:
|
|
+ case TILEGX_OPC_MULX:
|
|
case TILEGX_OPC_OR:
|
|
case TILEGX_OPC_XOR:
|
|
case TILEGX_OPC_NOR:
|
|
@@ -905,7 +905,6 @@ static SLJIT_INLINE sljit_ins * detect_jump_type(struct sljit_jump *jump, sljit_
|
|
sljit_sw diff;
|
|
sljit_uw target_addr;
|
|
sljit_ins *inst;
|
|
- sljit_ins saved_inst;
|
|
|
|
if (jump->flags & SLJIT_REWRITABLE_JUMP)
|
|
return code_ptr;
|
|
@@ -1009,7 +1008,7 @@ SLJIT_API_FUNC_ATTRIBUTE void * sljit_generate_code(struct sljit_compiler *compi
|
|
struct sljit_const *const_;
|
|
|
|
CHECK_ERROR_PTR();
|
|
- check_sljit_generate_code(compiler);
|
|
+ CHECK_PTR(check_sljit_generate_code(compiler));
|
|
reverse_buf(compiler);
|
|
|
|
code = (sljit_ins *)SLJIT_MALLOC_EXEC(compiler->size * sizeof(sljit_ins));
|
|
@@ -1178,13 +1177,13 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_enter(struct sljit_compiler *compil
|
|
sljit_si fscratches, sljit_si fsaveds, sljit_si local_size)
|
|
{
|
|
sljit_ins base;
|
|
- sljit_ins bundle = 0;
|
|
-
|
|
+ sljit_si i, tmp;
|
|
+
|
|
CHECK_ERROR();
|
|
- check_sljit_emit_enter(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size);
|
|
+ CHECK(check_sljit_emit_enter(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size));
|
|
set_emit_enter(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size);
|
|
|
|
- local_size += (saveds + 1) * sizeof(sljit_sw);
|
|
+ local_size += GET_SAVED_REGISTERS_SIZE(scratches, saveds, 1);
|
|
local_size = (local_size + 7) & ~7;
|
|
compiler->local_size = local_size;
|
|
|
|
@@ -1200,56 +1199,52 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_enter(struct sljit_compiler *compil
|
|
local_size = 0;
|
|
}
|
|
|
|
+ /* Save the return address. */
|
|
FAIL_IF(ADDLI(ADDR_TMP_mapped, base, local_size - 8));
|
|
FAIL_IF(ST_ADD(ADDR_TMP_mapped, RA, -8));
|
|
|
|
- if (saveds >= 1)
|
|
- FAIL_IF(ST_ADD(ADDR_TMP_mapped, SLJIT_SAVED_REG1_mapped, -8));
|
|
-
|
|
- if (saveds >= 2)
|
|
- FAIL_IF(ST_ADD(ADDR_TMP_mapped, SLJIT_SAVED_REG2_mapped, -8));
|
|
-
|
|
- if (saveds >= 3)
|
|
- FAIL_IF(ST_ADD(ADDR_TMP_mapped, SLJIT_SAVED_REG3_mapped, -8));
|
|
-
|
|
- if (saveds >= 4)
|
|
- FAIL_IF(ST_ADD(ADDR_TMP_mapped, SLJIT_SAVED_EREG1_mapped, -8));
|
|
-
|
|
- if (saveds >= 5)
|
|
- FAIL_IF(ST_ADD(ADDR_TMP_mapped, SLJIT_SAVED_EREG2_mapped, -8));
|
|
-
|
|
- if (args >= 1)
|
|
- FAIL_IF(ADD(SLJIT_SAVED_REG1_mapped, 0, ZERO));
|
|
+ /* Save the S registers. */
|
|
+ tmp = saveds < SLJIT_NUMBER_OF_SAVED_REGISTERS ? (SLJIT_S0 + 1 - saveds) : SLJIT_FIRST_SAVED_REG;
|
|
+ for (i = SLJIT_S0; i >= tmp; i--) {
|
|
+ FAIL_IF(ST_ADD(ADDR_TMP_mapped, reg_map[i], -8));
|
|
+ }
|
|
|
|
- if (args >= 2)
|
|
- FAIL_IF(ADD(SLJIT_SAVED_REG2_mapped, 1, ZERO));
|
|
+ /* Save the R registers that need to be reserved. */
|
|
+ for (i = scratches; i >= SLJIT_FIRST_SAVED_REG; i--) {
|
|
+ FAIL_IF(ST_ADD(ADDR_TMP_mapped, reg_map[i], -8));
|
|
+ }
|
|
|
|
- if (args >= 3)
|
|
- FAIL_IF(ADD(SLJIT_SAVED_REG3_mapped, 2, ZERO));
|
|
+ /* Move the arguments to S registers. */
|
|
+ for (i = 0; i < args; i++) {
|
|
+ FAIL_IF(ADD(reg_map[SLJIT_S0 - i], i, ZERO));
|
|
+ }
|
|
|
|
return SLJIT_SUCCESS;
|
|
}
|
|
|
|
-SLJIT_API_FUNC_ATTRIBUTE void sljit_set_context(struct sljit_compiler *compiler,
|
|
+SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_set_context(struct sljit_compiler *compiler,
|
|
sljit_si options, sljit_si args, sljit_si scratches, sljit_si saveds,
|
|
sljit_si fscratches, sljit_si fsaveds, sljit_si local_size)
|
|
{
|
|
- CHECK_ERROR_VOID();
|
|
- check_sljit_set_context(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size);
|
|
+ CHECK_ERROR();
|
|
+ CHECK(check_sljit_set_context(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size));
|
|
set_set_context(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size);
|
|
|
|
- local_size += (saveds + 1) * sizeof(sljit_sw);
|
|
+ local_size += GET_SAVED_REGISTERS_SIZE(scratches, saveds, 1);
|
|
compiler->local_size = (local_size + 7) & ~7;
|
|
+
|
|
+ return SLJIT_SUCCESS;
|
|
}
|
|
|
|
SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_return(struct sljit_compiler *compiler, sljit_si op, sljit_si src, sljit_sw srcw)
|
|
{
|
|
sljit_si local_size;
|
|
sljit_ins base;
|
|
- int addr_initialized = 0;
|
|
+ sljit_si i, tmp;
|
|
+ sljit_si saveds;
|
|
|
|
CHECK_ERROR();
|
|
- check_sljit_emit_return(compiler, op, src, srcw);
|
|
+ CHECK(check_sljit_emit_return(compiler, op, src, srcw));
|
|
|
|
FAIL_IF(emit_mov_before_return(compiler, op, src, srcw));
|
|
|
|
@@ -1263,50 +1258,20 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_return(struct sljit_compiler *compi
|
|
local_size = 0;
|
|
}
|
|
|
|
+ /* Restore the return address. */
|
|
FAIL_IF(ADDLI(ADDR_TMP_mapped, base, local_size - 8));
|
|
- FAIL_IF(LD(RA, ADDR_TMP_mapped));
|
|
-
|
|
- if (compiler->saveds >= 5) {
|
|
- FAIL_IF(ADDLI(ADDR_TMP_mapped, base, local_size - 48));
|
|
- addr_initialized = 1;
|
|
+ FAIL_IF(LD_ADD(RA, ADDR_TMP_mapped, -8));
|
|
|
|
- FAIL_IF(LD_ADD(SLJIT_SAVED_EREG2_mapped, ADDR_TMP_mapped, 8));
|
|
+ /* Restore the S registers. */
|
|
+ saveds = compiler->saveds;
|
|
+ tmp = saveds < SLJIT_NUMBER_OF_SAVED_REGISTERS ? (SLJIT_S0 + 1 - saveds) : SLJIT_FIRST_SAVED_REG;
|
|
+ for (i = SLJIT_S0; i >= tmp; i--) {
|
|
+ FAIL_IF(LD_ADD(reg_map[i], ADDR_TMP_mapped, -8));
|
|
}
|
|
|
|
- if (compiler->saveds >= 4) {
|
|
- if (addr_initialized == 0) {
|
|
- FAIL_IF(ADDLI(ADDR_TMP_mapped, base, local_size - 40));
|
|
- addr_initialized = 1;
|
|
- }
|
|
-
|
|
- FAIL_IF(LD_ADD(SLJIT_SAVED_EREG1_mapped, ADDR_TMP_mapped, 8));
|
|
- }
|
|
-
|
|
- if (compiler->saveds >= 3) {
|
|
- if (addr_initialized == 0) {
|
|
- FAIL_IF(ADDLI(ADDR_TMP_mapped, base, local_size - 32));
|
|
- addr_initialized = 1;
|
|
- }
|
|
-
|
|
- FAIL_IF(LD_ADD(SLJIT_SAVED_REG3_mapped, ADDR_TMP_mapped, 8));
|
|
- }
|
|
-
|
|
- if (compiler->saveds >= 2) {
|
|
- if (addr_initialized == 0) {
|
|
- FAIL_IF(ADDLI(ADDR_TMP_mapped, base, local_size - 24));
|
|
- addr_initialized = 1;
|
|
- }
|
|
-
|
|
- FAIL_IF(LD_ADD(SLJIT_SAVED_REG2_mapped, ADDR_TMP_mapped, 8));
|
|
- }
|
|
-
|
|
- if (compiler->saveds >= 1) {
|
|
- if (addr_initialized == 0) {
|
|
- FAIL_IF(ADDLI(ADDR_TMP_mapped, base, local_size - 16));
|
|
- /* addr_initialized = 1; no need to initialize as it's the last one. */
|
|
- }
|
|
-
|
|
- FAIL_IF(LD_ADD(SLJIT_SAVED_REG1_mapped, ADDR_TMP_mapped, 8));
|
|
+ /* Restore the R registers that need to be reserved. */
|
|
+ for (i = compiler->scratches; i >= SLJIT_FIRST_SAVED_REG; i--) {
|
|
+ FAIL_IF(LD_ADD(reg_map[i], ADDR_TMP_mapped, -8));
|
|
}
|
|
|
|
if (compiler->local_size <= SIMM_16BIT_MAX)
|
|
@@ -1585,7 +1550,7 @@ static SLJIT_INLINE sljit_si emit_op_mem2(struct sljit_compiler *compiler, sljit
|
|
SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_fast_enter(struct sljit_compiler *compiler, sljit_si dst, sljit_sw dstw)
|
|
{
|
|
CHECK_ERROR();
|
|
- check_sljit_emit_fast_enter(compiler, dst, dstw);
|
|
+ CHECK(check_sljit_emit_fast_enter(compiler, dst, dstw));
|
|
ADJUST_LOCAL_OFFSET(dst, dstw);
|
|
|
|
/* For UNUSED dst. Uncommon, but possible. */
|
|
@@ -1602,7 +1567,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_fast_enter(struct sljit_compiler *c
|
|
SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_fast_return(struct sljit_compiler *compiler, sljit_si src, sljit_sw srcw)
|
|
{
|
|
CHECK_ERROR();
|
|
- check_sljit_emit_fast_return(compiler, src, srcw);
|
|
+ CHECK(check_sljit_emit_fast_return(compiler, src, srcw));
|
|
ADJUST_LOCAL_OFFSET(src, srcw);
|
|
|
|
if (FAST_IS_REG(src))
|
|
@@ -1636,9 +1601,11 @@ static SLJIT_INLINE sljit_si emit_single_op(struct sljit_compiler *compiler, slj
|
|
if (op == SLJIT_MOV_SI)
|
|
return BFEXTS(reg_map[dst], reg_map[src2], 0, 31);
|
|
|
|
- return BFEXTU(reg_map[dst], reg_map[src2], 0, 31);
|
|
- } else if (dst != src2)
|
|
- SLJIT_ASSERT_STOP();
|
|
+ return BFEXTU(reg_map[dst], reg_map[src2], 0, 31);
|
|
+ } else if (dst != src2) {
|
|
+ SLJIT_ASSERT(src2 == 0);
|
|
+ return ADD(reg_map[dst], reg_map[src2], ZERO);
|
|
+ }
|
|
|
|
return SLJIT_SUCCESS;
|
|
|
|
@@ -1650,8 +1617,10 @@ static SLJIT_INLINE sljit_si emit_single_op(struct sljit_compiler *compiler, slj
|
|
return BFEXTS(reg_map[dst], reg_map[src2], 0, 7);
|
|
|
|
return BFEXTU(reg_map[dst], reg_map[src2], 0, 7);
|
|
- } else if (dst != src2)
|
|
- SLJIT_ASSERT_STOP();
|
|
+ } else if (dst != src2) {
|
|
+ SLJIT_ASSERT(src2 == 0);
|
|
+ return ADD(reg_map[dst], reg_map[src2], ZERO);
|
|
+ }
|
|
|
|
return SLJIT_SUCCESS;
|
|
|
|
@@ -1663,8 +1632,10 @@ static SLJIT_INLINE sljit_si emit_single_op(struct sljit_compiler *compiler, slj
|
|
return BFEXTS(reg_map[dst], reg_map[src2], 0, 15);
|
|
|
|
return BFEXTU(reg_map[dst], reg_map[src2], 0, 15);
|
|
- } else if (dst != src2)
|
|
- SLJIT_ASSERT_STOP();
|
|
+ } else if (dst != src2) {
|
|
+ SLJIT_ASSERT(src2 == 0);
|
|
+ return ADD(reg_map[dst], reg_map[src2], ZERO);
|
|
+ }
|
|
|
|
return SLJIT_SUCCESS;
|
|
|
|
@@ -1811,7 +1782,6 @@ static SLJIT_INLINE sljit_si emit_single_op(struct sljit_compiler *compiler, slj
|
|
else {
|
|
/* Rare ocasion. */
|
|
FAIL_IF(ADD(TMP_EREG2, reg_map[src1], ZERO));
|
|
-
|
|
overflow_ra = TMP_EREG2;
|
|
}
|
|
}
|
|
@@ -1903,6 +1873,17 @@ static SLJIT_INLINE sljit_si emit_single_op(struct sljit_compiler *compiler, slj
|
|
|
|
return SLJIT_SUCCESS;
|
|
|
|
+ case SLJIT_MUL:
|
|
+ if (flags & SRC2_IMM) {
|
|
+ FAIL_IF(load_immediate(compiler, TMP_REG2_mapped, src2));
|
|
+ src2 = TMP_REG2;
|
|
+ flags &= ~SRC2_IMM;
|
|
+ }
|
|
+
|
|
+ FAIL_IF(MUL(reg_map[dst], reg_map[src1], reg_map[src2]));
|
|
+
|
|
+ return SLJIT_SUCCESS;
|
|
+
|
|
#define EMIT_LOGICAL(op_imm, op_norm) \
|
|
if (flags & SRC2_IMM) { \
|
|
FAIL_IF(load_immediate(compiler, ADDR_TMP_mapped, src2)); \
|
|
@@ -1950,8 +1931,8 @@ static SLJIT_INLINE sljit_si emit_single_op(struct sljit_compiler *compiler, slj
|
|
} else { \
|
|
if (op & SLJIT_SET_E) \
|
|
FAIL_IF(push_3_buffer( \
|
|
- compiler, op_imm, reg_map[dst], reg_map[src1], \
|
|
- src2 & 0x3F, __LINE__)); \
|
|
+ compiler, op_norm, EQUAL_FLAG, reg_map[src1], \
|
|
+ reg_map[src2], __LINE__)); \
|
|
if (CHECK_FLAGS(SLJIT_SET_E)) \
|
|
FAIL_IF(push_3_buffer( \
|
|
compiler, op_norm, reg_map[dst], reg_map[src1], \
|
|
@@ -2105,66 +2086,61 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op_flags(struct sljit_compiler *com
|
|
{
|
|
sljit_si sugg_dst_ar, dst_ar;
|
|
sljit_si flags = GET_ALL_FLAGS(op);
|
|
+ sljit_si mem_type = (op & SLJIT_INT_OP) ? (INT_DATA | SIGNED_DATA) : WORD_DATA;
|
|
|
|
CHECK_ERROR();
|
|
- check_sljit_emit_op_flags(compiler, op, dst, dstw, src, srcw, type);
|
|
+ CHECK(check_sljit_emit_op_flags(compiler, op, dst, dstw, src, srcw, type));
|
|
ADJUST_LOCAL_OFFSET(dst, dstw);
|
|
|
|
if (dst == SLJIT_UNUSED)
|
|
return SLJIT_SUCCESS;
|
|
|
|
op = GET_OPCODE(op);
|
|
+ if (op == SLJIT_MOV_SI || op == SLJIT_MOV_UI)
|
|
+ mem_type = INT_DATA | SIGNED_DATA;
|
|
sugg_dst_ar = reg_map[(op < SLJIT_ADD && FAST_IS_REG(dst)) ? dst : TMP_REG2];
|
|
|
|
compiler->cache_arg = 0;
|
|
compiler->cache_argw = 0;
|
|
if (op >= SLJIT_ADD && (src & SLJIT_MEM)) {
|
|
ADJUST_LOCAL_OFFSET(src, srcw);
|
|
- FAIL_IF(emit_op_mem2(compiler, WORD_DATA | LOAD_DATA, TMP_REG1_mapped, src, srcw, dst, dstw));
|
|
+ FAIL_IF(emit_op_mem2(compiler, mem_type | LOAD_DATA, TMP_REG1_mapped, src, srcw, dst, dstw));
|
|
src = TMP_REG1;
|
|
srcw = 0;
|
|
}
|
|
|
|
- switch (type) {
|
|
- case SLJIT_C_EQUAL:
|
|
- case SLJIT_C_NOT_EQUAL:
|
|
+ switch (type & 0xff) {
|
|
+ case SLJIT_EQUAL:
|
|
+ case SLJIT_NOT_EQUAL:
|
|
FAIL_IF(CMPLTUI(sugg_dst_ar, EQUAL_FLAG, 1));
|
|
dst_ar = sugg_dst_ar;
|
|
break;
|
|
- case SLJIT_C_LESS:
|
|
- case SLJIT_C_GREATER_EQUAL:
|
|
- case SLJIT_C_FLOAT_LESS:
|
|
- case SLJIT_C_FLOAT_GREATER_EQUAL:
|
|
+ case SLJIT_LESS:
|
|
+ case SLJIT_GREATER_EQUAL:
|
|
dst_ar = ULESS_FLAG;
|
|
break;
|
|
- case SLJIT_C_GREATER:
|
|
- case SLJIT_C_LESS_EQUAL:
|
|
- case SLJIT_C_FLOAT_GREATER:
|
|
- case SLJIT_C_FLOAT_LESS_EQUAL:
|
|
+ case SLJIT_GREATER:
|
|
+ case SLJIT_LESS_EQUAL:
|
|
dst_ar = UGREATER_FLAG;
|
|
break;
|
|
- case SLJIT_C_SIG_LESS:
|
|
- case SLJIT_C_SIG_GREATER_EQUAL:
|
|
+ case SLJIT_SIG_LESS:
|
|
+ case SLJIT_SIG_GREATER_EQUAL:
|
|
dst_ar = LESS_FLAG;
|
|
break;
|
|
- case SLJIT_C_SIG_GREATER:
|
|
- case SLJIT_C_SIG_LESS_EQUAL:
|
|
+ case SLJIT_SIG_GREATER:
|
|
+ case SLJIT_SIG_LESS_EQUAL:
|
|
dst_ar = GREATER_FLAG;
|
|
break;
|
|
- case SLJIT_C_OVERFLOW:
|
|
- case SLJIT_C_NOT_OVERFLOW:
|
|
+ case SLJIT_OVERFLOW:
|
|
+ case SLJIT_NOT_OVERFLOW:
|
|
dst_ar = OVERFLOW_FLAG;
|
|
break;
|
|
- case SLJIT_C_MUL_OVERFLOW:
|
|
- case SLJIT_C_MUL_NOT_OVERFLOW:
|
|
+ case SLJIT_MUL_OVERFLOW:
|
|
+ case SLJIT_MUL_NOT_OVERFLOW:
|
|
FAIL_IF(CMPLTUI(sugg_dst_ar, OVERFLOW_FLAG, 1));
|
|
dst_ar = sugg_dst_ar;
|
|
type ^= 0x1; /* Flip type bit for the XORI below. */
|
|
break;
|
|
- case SLJIT_C_FLOAT_EQUAL:
|
|
- case SLJIT_C_FLOAT_NOT_EQUAL:
|
|
- dst_ar = EQUAL_FLAG;
|
|
- break;
|
|
|
|
default:
|
|
SLJIT_ASSERT_STOP();
|
|
@@ -2180,11 +2156,11 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op_flags(struct sljit_compiler *com
|
|
if (op >= SLJIT_ADD) {
|
|
if (TMP_REG2_mapped != dst_ar)
|
|
FAIL_IF(ADD(TMP_REG2_mapped, dst_ar, ZERO));
|
|
- return emit_op(compiler, op | flags, CUMULATIVE_OP | LOGICAL_OP | IMM_OP | ALT_KEEP_CACHE, dst, dstw, src, srcw, TMP_REG2, 0);
|
|
+ return emit_op(compiler, op | flags, mem_type | CUMULATIVE_OP | LOGICAL_OP | IMM_OP | ALT_KEEP_CACHE, dst, dstw, src, srcw, TMP_REG2, 0);
|
|
}
|
|
|
|
if (dst & SLJIT_MEM)
|
|
- return emit_op_mem(compiler, WORD_DATA, dst_ar, dst, dstw);
|
|
+ return emit_op_mem(compiler, mem_type, dst_ar, dst, dstw);
|
|
|
|
if (sugg_dst_ar != dst_ar)
|
|
return ADD(sugg_dst_ar, dst_ar, ZERO);
|
|
@@ -2194,7 +2170,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op_flags(struct sljit_compiler *com
|
|
|
|
SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler, sljit_si op) {
|
|
CHECK_ERROR();
|
|
- check_sljit_emit_op0(compiler, op);
|
|
+ CHECK(check_sljit_emit_op0(compiler, op));
|
|
|
|
op = GET_OPCODE(op);
|
|
switch (op) {
|
|
@@ -2204,10 +2180,10 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler
|
|
case SLJIT_BREAKPOINT:
|
|
return PI(BPT);
|
|
|
|
- case SLJIT_UMUL:
|
|
- case SLJIT_SMUL:
|
|
- case SLJIT_UDIV:
|
|
- case SLJIT_SDIV:
|
|
+ case SLJIT_LUMUL:
|
|
+ case SLJIT_LSMUL:
|
|
+ case SLJIT_UDIVI:
|
|
+ case SLJIT_SDIVI:
|
|
SLJIT_ASSERT_STOP();
|
|
}
|
|
|
|
@@ -2217,7 +2193,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler
|
|
SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op1(struct sljit_compiler *compiler, sljit_si op, sljit_si dst, sljit_sw dstw, sljit_si src, sljit_sw srcw)
|
|
{
|
|
CHECK_ERROR();
|
|
- check_sljit_emit_op1(compiler, op, dst, dstw, src, srcw);
|
|
+ CHECK(check_sljit_emit_op1(compiler, op, dst, dstw, src, srcw));
|
|
ADJUST_LOCAL_OFFSET(dst, dstw);
|
|
ADJUST_LOCAL_OFFSET(src, srcw);
|
|
|
|
@@ -2273,7 +2249,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op1(struct sljit_compiler *compiler
|
|
return emit_op(compiler, SLJIT_SUB | GET_ALL_FLAGS(op), IMM_OP, dst, dstw, SLJIT_IMM, 0, src, srcw);
|
|
|
|
case SLJIT_CLZ:
|
|
- return emit_op(compiler, op, 0, dst, dstw, TMP_REG1, 0, src, srcw);
|
|
+ return emit_op(compiler, op, (op & SLJIT_INT_OP) ? INT_DATA : WORD_DATA, dst, dstw, TMP_REG1, 0, src, srcw);
|
|
}
|
|
|
|
return SLJIT_SUCCESS;
|
|
@@ -2282,7 +2258,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op1(struct sljit_compiler *compiler
|
|
SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op2(struct sljit_compiler *compiler, sljit_si op, sljit_si dst, sljit_sw dstw, sljit_si src1, sljit_sw src1w, sljit_si src2, sljit_sw src2w)
|
|
{
|
|
CHECK_ERROR();
|
|
- check_sljit_emit_op2(compiler, op, dst, dstw, src1, src1w, src2, src2w);
|
|
+ CHECK(check_sljit_emit_op2(compiler, op, dst, dstw, src1, src1w, src2, src2w));
|
|
ADJUST_LOCAL_OFFSET(dst, dstw);
|
|
ADJUST_LOCAL_OFFSET(src1, src1w);
|
|
ADJUST_LOCAL_OFFSET(src2, src2w);
|
|
@@ -2325,7 +2301,7 @@ SLJIT_API_FUNC_ATTRIBUTE struct sljit_label * sljit_emit_label(struct sljit_comp
|
|
flush_buffer(compiler);
|
|
|
|
CHECK_ERROR_PTR();
|
|
- check_sljit_emit_label(compiler);
|
|
+ CHECK_PTR(check_sljit_emit_label(compiler));
|
|
|
|
if (compiler->last_label && compiler->last_label->size == compiler->size)
|
|
return compiler->last_label;
|
|
@@ -2344,7 +2320,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_ijump(struct sljit_compiler *compil
|
|
flush_buffer(compiler);
|
|
|
|
CHECK_ERROR();
|
|
- check_sljit_emit_ijump(compiler, type, src, srcw);
|
|
+ CHECK(check_sljit_emit_ijump(compiler, type, src, srcw));
|
|
ADJUST_LOCAL_OFFSET(src, srcw);
|
|
|
|
if (FAST_IS_REG(src)) {
|
|
@@ -2404,8 +2380,10 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_ijump(struct sljit_compiler *compil
|
|
|
|
return SLJIT_SUCCESS;
|
|
|
|
- } else if (src & SLJIT_MEM)
|
|
+ } else if (src & SLJIT_MEM) {
|
|
FAIL_IF(emit_op(compiler, SLJIT_MOV, WORD_DATA, TMP_REG2, 0, TMP_REG1, 0, src, srcw));
|
|
+ flush_buffer(compiler);
|
|
+ }
|
|
|
|
FAIL_IF(JR_SOLO(reg_map[src_r]));
|
|
|
|
@@ -2432,7 +2410,7 @@ SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump * sljit_emit_jump(struct sljit_compil
|
|
flush_buffer(compiler);
|
|
|
|
CHECK_ERROR_PTR();
|
|
- check_sljit_emit_jump(compiler, type);
|
|
+ CHECK_PTR(check_sljit_emit_jump(compiler, type));
|
|
|
|
jump = (struct sljit_jump *)ensure_abuf(compiler, sizeof(struct sljit_jump));
|
|
PTR_FAIL_IF(!jump);
|
|
@@ -2440,48 +2418,42 @@ SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump * sljit_emit_jump(struct sljit_compil
|
|
type &= 0xff;
|
|
|
|
switch (type) {
|
|
- case SLJIT_C_EQUAL:
|
|
- case SLJIT_C_FLOAT_NOT_EQUAL:
|
|
+ case SLJIT_EQUAL:
|
|
BR_NZ(EQUAL_FLAG);
|
|
break;
|
|
- case SLJIT_C_NOT_EQUAL:
|
|
- case SLJIT_C_FLOAT_EQUAL:
|
|
+ case SLJIT_NOT_EQUAL:
|
|
BR_Z(EQUAL_FLAG);
|
|
break;
|
|
- case SLJIT_C_LESS:
|
|
- case SLJIT_C_FLOAT_LESS:
|
|
+ case SLJIT_LESS:
|
|
BR_Z(ULESS_FLAG);
|
|
break;
|
|
- case SLJIT_C_GREATER_EQUAL:
|
|
- case SLJIT_C_FLOAT_GREATER_EQUAL:
|
|
+ case SLJIT_GREATER_EQUAL:
|
|
BR_NZ(ULESS_FLAG);
|
|
break;
|
|
- case SLJIT_C_GREATER:
|
|
- case SLJIT_C_FLOAT_GREATER:
|
|
+ case SLJIT_GREATER:
|
|
BR_Z(UGREATER_FLAG);
|
|
break;
|
|
- case SLJIT_C_LESS_EQUAL:
|
|
- case SLJIT_C_FLOAT_LESS_EQUAL:
|
|
+ case SLJIT_LESS_EQUAL:
|
|
BR_NZ(UGREATER_FLAG);
|
|
break;
|
|
- case SLJIT_C_SIG_LESS:
|
|
+ case SLJIT_SIG_LESS:
|
|
BR_Z(LESS_FLAG);
|
|
break;
|
|
- case SLJIT_C_SIG_GREATER_EQUAL:
|
|
+ case SLJIT_SIG_GREATER_EQUAL:
|
|
BR_NZ(LESS_FLAG);
|
|
break;
|
|
- case SLJIT_C_SIG_GREATER:
|
|
+ case SLJIT_SIG_GREATER:
|
|
BR_Z(GREATER_FLAG);
|
|
break;
|
|
- case SLJIT_C_SIG_LESS_EQUAL:
|
|
+ case SLJIT_SIG_LESS_EQUAL:
|
|
BR_NZ(GREATER_FLAG);
|
|
break;
|
|
- case SLJIT_C_OVERFLOW:
|
|
- case SLJIT_C_MUL_OVERFLOW:
|
|
+ case SLJIT_OVERFLOW:
|
|
+ case SLJIT_MUL_OVERFLOW:
|
|
BR_Z(OVERFLOW_FLAG);
|
|
break;
|
|
- case SLJIT_C_NOT_OVERFLOW:
|
|
- case SLJIT_C_MUL_NOT_OVERFLOW:
|
|
+ case SLJIT_NOT_OVERFLOW:
|
|
+ case SLJIT_MUL_NOT_OVERFLOW:
|
|
BR_NZ(OVERFLOW_FLAG);
|
|
break;
|
|
default:
|
|
@@ -2536,7 +2508,7 @@ SLJIT_API_FUNC_ATTRIBUTE struct sljit_const * sljit_emit_const(struct sljit_comp
|
|
flush_buffer(compiler);
|
|
|
|
CHECK_ERROR_PTR();
|
|
- check_sljit_emit_const(compiler, dst, dstw, init_value);
|
|
+ CHECK_PTR(check_sljit_emit_const(compiler, dst, dstw, init_value));
|
|
ADJUST_LOCAL_OFFSET(dst, dstw);
|
|
|
|
const_ = (struct sljit_const *)ensure_abuf(compiler, sizeof(struct sljit_const));
|
|
@@ -2572,3 +2544,18 @@ SLJIT_API_FUNC_ATTRIBUTE void sljit_set_const(sljit_uw addr, sljit_sw new_consta
|
|
inst[3] = (inst[3] & ~(0xFFFFL << 43)) | ((new_constant & 0xFFFFL) << 43);
|
|
SLJIT_CACHE_FLUSH(inst, inst + 4);
|
|
}
|
|
+
|
|
+SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_get_register_index(sljit_si reg)
|
|
+{
|
|
+ CHECK_REG_INDEX(check_sljit_get_register_index(reg));
|
|
+ return reg_map[reg];
|
|
+}
|
|
+
|
|
+SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op_custom(struct sljit_compiler *compiler,
|
|
+ void *instruction, sljit_si size)
|
|
+{
|
|
+ CHECK_ERROR();
|
|
+ CHECK(check_sljit_emit_op_custom(compiler, instruction, size));
|
|
+ return SLJIT_ERR_UNSUPPORTED;
|
|
+}
|
|
+
|
|
diff --git a/ext/pcre/pcrelib/sljit/sljitNativeX86_common.c b/ext/pcre/pcrelib/sljit/sljitNativeX86_common.c
|
|
index 22a163f..416c15a 100644
|
|
--- a/ext/pcre/pcrelib/sljit/sljitNativeX86_common.c
|
|
+++ b/ext/pcre/pcrelib/sljit/sljitNativeX86_common.c
|
|
@@ -273,7 +273,9 @@ static sljit_si cpu_has_sse2 = -1;
|
|
#endif
|
|
static sljit_si cpu_has_cmov = -1;
|
|
|
|
-#if defined(_MSC_VER) && _MSC_VER >= 1400
|
|
+#ifdef _WIN32_WCE
|
|
+#include <cmnintrin.h>
|
|
+#elif defined(_MSC_VER) && _MSC_VER >= 1400
|
|
#include <intrin.h>
|
|
#endif
|
|
|
|
@@ -742,8 +744,10 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler
|
|
break;
|
|
case SLJIT_LUMUL:
|
|
case SLJIT_LSMUL:
|
|
- case SLJIT_LUDIV:
|
|
- case SLJIT_LSDIV:
|
|
+ case SLJIT_UDIVMOD:
|
|
+ case SLJIT_SDIVMOD:
|
|
+ case SLJIT_UDIVI:
|
|
+ case SLJIT_SDIVI:
|
|
compiler->flags_saved = 0;
|
|
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
|
|
#ifdef _WIN64
|
|
@@ -761,9 +765,10 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler
|
|
#endif
|
|
compiler->mode32 = op & SLJIT_INT_OP;
|
|
#endif
|
|
+ SLJIT_COMPILE_ASSERT((SLJIT_UDIVMOD & 0x2) == 0 && SLJIT_UDIVI - 0x2 == SLJIT_UDIVMOD, bad_div_opcode_assignments);
|
|
|
|
op = GET_OPCODE(op);
|
|
- if (op == SLJIT_LUDIV) {
|
|
+ if ((op | 0x2) == SLJIT_UDIVI) {
|
|
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) || defined(_WIN64)
|
|
EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_R1, 0);
|
|
inst = emit_x86_instruction(compiler, 1, SLJIT_R1, 0, SLJIT_R1, 0);
|
|
@@ -774,7 +779,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler
|
|
*inst = XOR_r_rm;
|
|
}
|
|
|
|
- if (op == SLJIT_LSDIV) {
|
|
+ if ((op | 0x2) == SLJIT_SDIVI) {
|
|
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) || defined(_WIN64)
|
|
EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_R1, 0);
|
|
#endif
|
|
@@ -805,10 +810,10 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler
|
|
FAIL_IF(!inst);
|
|
INC_SIZE(2);
|
|
*inst++ = GROUP_F7;
|
|
- *inst = MOD_REG | ((op >= SLJIT_LUDIV) ? reg_map[TMP_REG1] : reg_map[SLJIT_R1]);
|
|
+ *inst = MOD_REG | ((op >= SLJIT_UDIVMOD) ? reg_map[TMP_REG1] : reg_map[SLJIT_R1]);
|
|
#else
|
|
#ifdef _WIN64
|
|
- size = (!compiler->mode32 || op >= SLJIT_LUDIV) ? 3 : 2;
|
|
+ size = (!compiler->mode32 || op >= SLJIT_UDIVMOD) ? 3 : 2;
|
|
#else
|
|
size = (!compiler->mode32) ? 3 : 2;
|
|
#endif
|
|
@@ -817,11 +822,11 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler
|
|
INC_SIZE(size);
|
|
#ifdef _WIN64
|
|
if (!compiler->mode32)
|
|
- *inst++ = REX_W | ((op >= SLJIT_LUDIV) ? REX_B : 0);
|
|
- else if (op >= SLJIT_LUDIV)
|
|
+ *inst++ = REX_W | ((op >= SLJIT_UDIVMOD) ? REX_B : 0);
|
|
+ else if (op >= SLJIT_UDIVMOD)
|
|
*inst++ = REX_B;
|
|
*inst++ = GROUP_F7;
|
|
- *inst = MOD_REG | ((op >= SLJIT_LUDIV) ? reg_lmap[TMP_REG1] : reg_lmap[SLJIT_R1]);
|
|
+ *inst = MOD_REG | ((op >= SLJIT_UDIVMOD) ? reg_lmap[TMP_REG1] : reg_lmap[SLJIT_R1]);
|
|
#else
|
|
if (!compiler->mode32)
|
|
*inst++ = REX_W;
|
|
@@ -836,15 +841,21 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler
|
|
case SLJIT_LSMUL:
|
|
*inst |= IMUL;
|
|
break;
|
|
- case SLJIT_LUDIV:
|
|
+ case SLJIT_UDIVMOD:
|
|
+ case SLJIT_UDIVI:
|
|
*inst |= DIV;
|
|
break;
|
|
- case SLJIT_LSDIV:
|
|
+ case SLJIT_SDIVMOD:
|
|
+ case SLJIT_SDIVI:
|
|
*inst |= IDIV;
|
|
break;
|
|
}
|
|
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) && !defined(_WIN64)
|
|
- EMIT_MOV(compiler, SLJIT_R1, 0, TMP_REG1, 0);
|
|
+ if (op <= SLJIT_SDIVMOD)
|
|
+ EMIT_MOV(compiler, SLJIT_R1, 0, TMP_REG1, 0);
|
|
+#else
|
|
+ if (op >= SLJIT_UDIVI)
|
|
+ EMIT_MOV(compiler, SLJIT_R1, 0, TMP_REG1, 0);
|
|
#endif
|
|
break;
|
|
}
|
|
@@ -1905,60 +1916,62 @@ static sljit_si emit_test_binary(struct sljit_compiler *compiler,
|
|
return SLJIT_SUCCESS;
|
|
}
|
|
|
|
- if (FAST_IS_REG(src1)) {
|
|
+ if (!(src1 & SLJIT_IMM)) {
|
|
if (src2 & SLJIT_IMM) {
|
|
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
|
|
if (IS_HALFWORD(src2w) || compiler->mode32) {
|
|
- inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, 0);
|
|
+ inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, src1w);
|
|
FAIL_IF(!inst);
|
|
*inst = GROUP_F7;
|
|
}
|
|
else {
|
|
FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src2w));
|
|
- inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, src1, 0);
|
|
+ inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, src1, src1w);
|
|
FAIL_IF(!inst);
|
|
*inst = TEST_rm_r;
|
|
}
|
|
#else
|
|
- inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, 0);
|
|
+ inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, src1w);
|
|
FAIL_IF(!inst);
|
|
*inst = GROUP_F7;
|
|
#endif
|
|
+ return SLJIT_SUCCESS;
|
|
}
|
|
- else {
|
|
+ else if (FAST_IS_REG(src1)) {
|
|
inst = emit_x86_instruction(compiler, 1, src1, 0, src2, src2w);
|
|
FAIL_IF(!inst);
|
|
*inst = TEST_rm_r;
|
|
+ return SLJIT_SUCCESS;
|
|
}
|
|
- return SLJIT_SUCCESS;
|
|
}
|
|
|
|
- if (FAST_IS_REG(src2)) {
|
|
+ if (!(src2 & SLJIT_IMM)) {
|
|
if (src1 & SLJIT_IMM) {
|
|
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
|
|
if (IS_HALFWORD(src1w) || compiler->mode32) {
|
|
- inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src1w, src2, 0);
|
|
+ inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src1w, src2, src2w);
|
|
FAIL_IF(!inst);
|
|
*inst = GROUP_F7;
|
|
}
|
|
else {
|
|
FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src1w));
|
|
- inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, src2, 0);
|
|
+ inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, src2, src2w);
|
|
FAIL_IF(!inst);
|
|
*inst = TEST_rm_r;
|
|
}
|
|
#else
|
|
- inst = emit_x86_instruction(compiler, 1, src1, src1w, src2, 0);
|
|
+ inst = emit_x86_instruction(compiler, 1, src1, src1w, src2, src2w);
|
|
FAIL_IF(!inst);
|
|
*inst = GROUP_F7;
|
|
#endif
|
|
+ return SLJIT_SUCCESS;
|
|
}
|
|
- else {
|
|
+ else if (FAST_IS_REG(src2)) {
|
|
inst = emit_x86_instruction(compiler, 1, src2, 0, src1, src1w);
|
|
FAIL_IF(!inst);
|
|
*inst = TEST_rm_r;
|
|
+ return SLJIT_SUCCESS;
|
|
}
|
|
- return SLJIT_SUCCESS;
|
|
}
|
|
|
|
EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
|
|
@@ -2923,3 +2936,69 @@ SLJIT_API_FUNC_ATTRIBUTE void sljit_set_const(sljit_uw addr, sljit_sw new_consta
|
|
{
|
|
*(sljit_sw*)addr = new_constant;
|
|
}
|
|
+
|
|
+SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_x86_is_sse2_available(void)
|
|
+{
|
|
+#if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
|
|
+ if (cpu_has_sse2 == -1)
|
|
+ get_cpu_features();
|
|
+ return cpu_has_sse2;
|
|
+#else
|
|
+ return 1;
|
|
+#endif
|
|
+}
|
|
+
|
|
+SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_x86_is_cmov_available(void)
|
|
+{
|
|
+ if (cpu_has_cmov == -1)
|
|
+ get_cpu_features();
|
|
+ return cpu_has_cmov;
|
|
+}
|
|
+
|
|
+SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_x86_emit_cmov(struct sljit_compiler *compiler,
|
|
+ sljit_si type,
|
|
+ sljit_si dst_reg,
|
|
+ sljit_si src, sljit_sw srcw)
|
|
+{
|
|
+ sljit_ub* inst;
|
|
+
|
|
+ CHECK_ERROR();
|
|
+#if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
|
|
+ CHECK_ARGUMENT(sljit_x86_is_cmov_available());
|
|
+ CHECK_ARGUMENT(!(type & ~(0xff | SLJIT_INT_OP)));
|
|
+ CHECK_ARGUMENT((type & 0xff) >= SLJIT_EQUAL && (type & 0xff) <= SLJIT_D_ORDERED);
|
|
+ CHECK_ARGUMENT(FUNCTION_CHECK_IS_REG(dst_reg & ~SLJIT_INT_OP));
|
|
+ FUNCTION_CHECK_SRC(src, srcw);
|
|
+#endif
|
|
+#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
|
|
+ if (SLJIT_UNLIKELY(!!compiler->verbose)) {
|
|
+ fprintf(compiler->verbose, " x86_cmov%s %s%s, ",
|
|
+ !(dst_reg & SLJIT_INT_OP) ? "" : ".i",
|
|
+ JUMP_PREFIX(type), jump_names[type & 0xff]);
|
|
+ sljit_verbose_reg(compiler, dst_reg & ~SLJIT_INT_OP);
|
|
+ fprintf(compiler->verbose, ", ");
|
|
+ sljit_verbose_param(compiler, src, srcw);
|
|
+ fprintf(compiler->verbose, "\n");
|
|
+ }
|
|
+#endif
|
|
+
|
|
+ ADJUST_LOCAL_OFFSET(src, srcw);
|
|
+ CHECK_EXTRA_REGS(src, srcw, (void)0);
|
|
+
|
|
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
|
|
+ compiler->mode32 = dst_reg & SLJIT_INT_OP;
|
|
+#endif
|
|
+ dst_reg &= ~SLJIT_INT_OP;
|
|
+
|
|
+ if (SLJIT_UNLIKELY(src & SLJIT_IMM)) {
|
|
+ EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, srcw);
|
|
+ src = TMP_REG1;
|
|
+ srcw = 0;
|
|
+ }
|
|
+
|
|
+ inst = emit_x86_instruction(compiler, 2, dst_reg, 0, src, srcw);
|
|
+ FAIL_IF(!inst);
|
|
+ *inst++ = GROUP_0F;
|
|
+ *inst = get_jump_code(type & 0xff) - 0x40;
|
|
+ return SLJIT_SUCCESS;
|
|
+}
|