From 99d075ebbc87c96c18e8f8042db49ff57c5b75c6 Mon Sep 17 00:00:00 2001 From: Remi Collet Date: Tue, 16 Feb 2016 22:54:26 +0100 Subject: php 5.4.45-4 (security fix backported from 5.5.32) --- pcre838.patch | 6665 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 6665 insertions(+) create mode 100644 pcre838.patch (limited to 'pcre838.patch') diff --git a/pcre838.patch b/pcre838.patch new file mode 100644 index 0000000..3b3dcdf --- /dev/null +++ b/pcre838.patch @@ -0,0 +1,6665 @@ +Backported from 5.5 for 5.4 by Remi Collet + + +diff -ru php-5.4.45/ext/pcre/pcrelib/config.h php55/php-5.5.31/ext/pcre/pcrelib/config.h +--- php-5.4.45/ext/pcre/pcrelib/config.h 2015-09-01 22:09:37.000000000 +0200 ++++ php-5.5.31/ext/pcre/pcrelib/config.h 2016-01-06 10:36:49.000000000 +0100 +@@ -302,6 +302,8 @@ + */ + /* #undef NO_RECURSE */ + ++#define PARENS_NEST_LIMIT 250 ++ + /* Name of package */ + #define PACKAGE "pcre" + +diff -ru php54/php-5.4.45/ext/pcre/pcrelib/pcre_exec.c php55/php-5.5.31/ext/pcre/pcrelib/pcre_exec.c +--- php-5.4.45/ext/pcre/pcrelib/pcre_exec.c 2015-09-01 22:09:37.000000000 +0200 ++++ php-5.5.31/ext/pcre/pcrelib/pcre_exec.c 2016-01-06 10:36:49.000000000 +0100 +@@ -688,7 +688,7 @@ + #define foc number + #define save_mark data + +-/* These statements are here to stop the compiler complaining about unitialized ++/* These statements are here to stop the compiler complaining about uninitialized + variables. 
*/ + + #ifdef SUPPORT_UCP + +From ca02d9c2d6f9bea7bf8abe607f1ee9484b1d7b62 Mon Sep 17 00:00:00 2001 +From: Stanislav Malyshev +Date: Sun, 31 Jan 2016 20:33:17 -0800 +Subject: [PATCH] Upgrade bundled PCRE to 8.38 + +--- + NEWS | 3 + + ext/pcre/pcrelib/ChangeLog | 176 ++ + ext/pcre/pcrelib/NEWS | 8 + + ext/pcre/pcrelib/config.h | 11 +- + ext/pcre/pcrelib/doc/pcre.txt | 2130 +++++++++++----------- + ext/pcre/pcrelib/pcre.h | 4 +- + ext/pcre/pcrelib/pcre_compile.c | 334 +++- + ext/pcre/pcrelib/pcre_exec.c | 5 +- + ext/pcre/pcrelib/pcre_internal.h | 17 +- + ext/pcre/pcrelib/pcre_jit_compile.c | 77 +- + ext/pcre/pcrelib/pcre_study.c | 19 +- + ext/pcre/pcrelib/pcre_xclass.c | 2 +- + ext/pcre/pcrelib/sljit/sljitConfig.h | 9 + + ext/pcre/pcrelib/sljit/sljitConfigInternal.h | 13 +- + ext/pcre/pcrelib/sljit/sljitLir.c | 10 +- + ext/pcre/pcrelib/sljit/sljitLir.h | 128 +- + ext/pcre/pcrelib/sljit/sljitNativeARM_32.c | 27 +- + ext/pcre/pcrelib/sljit/sljitNativeARM_64.c | 48 +- + ext/pcre/pcrelib/sljit/sljitNativeARM_T2_32.c | 58 +- + ext/pcre/pcrelib/sljit/sljitNativeMIPS_common.c | 15 +- + ext/pcre/pcrelib/sljit/sljitNativePPC_common.c | 23 +- + ext/pcre/pcrelib/sljit/sljitNativeSPARC_common.c | 19 +- + ext/pcre/pcrelib/sljit/sljitNativeTILEGX_64.c | 311 ++-- + ext/pcre/pcrelib/sljit/sljitNativeX86_common.c | 129 +- + ext/pcre/pcrelib/testdata/grepoutput | 12 + + ext/pcre/pcrelib/testdata/testinput1 | 13 + + ext/pcre/pcrelib/testdata/testinput11 | 4 + + ext/pcre/pcrelib/testdata/testinput12 | 17 + + ext/pcre/pcrelib/testdata/testinput14 | 2 + + ext/pcre/pcrelib/testdata/testinput17 | 2 + + ext/pcre/pcrelib/testdata/testinput2 | 139 ++ + ext/pcre/pcrelib/testdata/testinput4 | 5 + + ext/pcre/pcrelib/testdata/testinput5 | 8 + + ext/pcre/pcrelib/testdata/testinput6 | 57 + + ext/pcre/pcrelib/testdata/testinput7 | 15 + + ext/pcre/pcrelib/testdata/testinput8 | 4 + + ext/pcre/pcrelib/testdata/testinputEBC | 3 + + ext/pcre/pcrelib/testdata/testoutput1 | 23 + + 
ext/pcre/pcrelib/testdata/testoutput11-16 | 50 +- + ext/pcre/pcrelib/testdata/testoutput11-32 | 50 +- + ext/pcre/pcrelib/testdata/testoutput11-8 | 50 +- + ext/pcre/pcrelib/testdata/testoutput12 | 25 + + ext/pcre/pcrelib/testdata/testoutput14 | 2 + + ext/pcre/pcrelib/testdata/testoutput17 | 2 + + ext/pcre/pcrelib/testdata/testoutput2 | 380 +++- + ext/pcre/pcrelib/testdata/testoutput4 | 6 + + ext/pcre/pcrelib/testdata/testoutput5 | 45 + + ext/pcre/pcrelib/testdata/testoutput6 | 96 + + ext/pcre/pcrelib/testdata/testoutput7 | 57 +- + ext/pcre/pcrelib/testdata/testoutput8 | 6 + + ext/pcre/pcrelib/testdata/testoutputEBC | 6 + + 51 files changed, 3144 insertions(+), 1511 deletions(-) + +diff --git a/ext/pcre/pcrelib/ChangeLog b/ext/pcre/pcrelib/ChangeLog +index 359b412..5e5bf18 100644 +--- a/ext/pcre/pcrelib/ChangeLog ++++ b/ext/pcre/pcrelib/ChangeLog +@@ -1,6 +1,182 @@ + ChangeLog for PCRE + ------------------ + ++Note that the PCRE 8.xx series (PCRE1) is now in a bugfix-only state. All ++development is happening in the PCRE2 10.xx series. ++ ++Version 8.38 23-November-2015 ++----------------------------- ++ ++1. If a group that contained a recursive back reference also contained a ++ forward reference subroutine call followed by a non-forward-reference ++ subroutine call, for example /.((?2)(?R)\1)()/, pcre2_compile() failed to ++ compile correct code, leading to undefined behaviour or an internally ++ detected error. This bug was discovered by the LLVM fuzzer. ++ ++2. Quantification of certain items (e.g. atomic back references) could cause ++ incorrect code to be compiled when recursive forward references were ++ involved. For example, in this pattern: /(?1)()((((((\1++))\x85)+)|))/. ++ This bug was discovered by the LLVM fuzzer. ++ ++3. A repeated conditional group whose condition was a reference by name caused ++ a buffer overflow if there was more than one group with the given name. ++ This bug was discovered by the LLVM fuzzer. ++ ++4. 
A recursive back reference by name within a group that had the same name as ++ another group caused a buffer overflow. For example: ++ /(?J)(?'d'(?'d'\g{d}))/. This bug was discovered by the LLVM fuzzer. ++ ++5. A forward reference by name to a group whose number is the same as the ++ current group, for example in this pattern: /(?|(\k'Pm')|(?'Pm'))/, caused ++ a buffer overflow at compile time. This bug was discovered by the LLVM ++ fuzzer. ++ ++6. A lookbehind assertion within a set of mutually recursive subpatterns could ++ provoke a buffer overflow. This bug was discovered by the LLVM fuzzer. ++ ++7. Another buffer overflow bug involved duplicate named groups with a ++ reference between their definition, with a group that reset capture ++ numbers, for example: /(?J:(?|(?'R')(\k'R')|((?'R'))))/. This has been ++ fixed by always allowing for more memory, even if not needed. (A proper fix ++ is implemented in PCRE2, but it involves more refactoring.) ++ ++8. There was no check for integer overflow in subroutine calls such as (?123). ++ ++9. The table entry for \l in EBCDIC environments was incorrect, leading to its ++ being treated as a literal 'l' instead of causing an error. ++ ++10. There was a buffer overflow if pcre_exec() was called with an ovector of ++ size 1. This bug was found by american fuzzy lop. ++ ++11. If a non-capturing group containing a conditional group that could match ++ an empty string was repeated, it was not identified as matching an empty ++ string itself. For example: /^(?:(?(1)x|)+)+$()/. ++ ++12. In an EBCDIC environment, pcretest was mishandling the escape sequences ++ \a and \e in test subject lines. ++ ++13. In an EBCDIC environment, \a in a pattern was converted to the ASCII ++ instead of the EBCDIC value. ++ ++14. The handling of \c in an EBCDIC environment has been revised so that it is ++ now compatible with the specification in Perl's perlebcdic page. ++ ++15. 
The EBCDIC character 0x41 is a non-breaking space, equivalent to 0xa0 in ++ ASCII/Unicode. This has now been added to the list of characters that are ++ recognized as white space in EBCDIC. ++ ++16. When PCRE was compiled without UCP support, the use of \p and \P gave an ++ error (correctly) when used outside a class, but did not give an error ++ within a class. ++ ++17. \h within a class was incorrectly compiled in EBCDIC environments. ++ ++18. A pattern with an unmatched closing parenthesis that contained a backward ++ assertion which itself contained a forward reference caused buffer ++ overflow. An example pattern is: /(?=di(?<=(?1))|(?=(.))))/. ++ ++19. JIT should return with error when the compiled pattern requires more stack ++ space than the maximum. ++ ++20. A possessively repeated conditional group that could match an empty string, ++ for example, /(?(R))*+/, was incorrectly compiled. ++ ++21. Fix infinite recursion in the JIT compiler when certain patterns such as ++ /(?:|a|){100}x/ are analysed. ++ ++22. Some patterns with character classes involving [: and \\ were incorrectly ++ compiled and could cause reading from uninitialized memory or an incorrect ++ error diagnosis. ++ ++23. Pathological patterns containing many nested occurrences of [: caused ++ pcre_compile() to run for a very long time. ++ ++24. A conditional group with only one branch has an implicit empty alternative ++ branch and must therefore be treated as potentially matching an empty ++ string. ++ ++25. If (?R was followed by - or + incorrect behaviour happened instead of a ++ diagnostic. ++ ++26. Arrange to give up on finding the minimum matching length for overly ++ complex patterns. ++ ++27. Similar to (4) above: in a pattern with duplicated named groups and an ++ occurrence of (?| it is possible for an apparently non-recursive back ++ reference to become recursive if a later named group with the relevant ++ number is encountered. This could lead to a buffer overflow. 
Wen Guanxing ++ from Venustech ADLAB discovered this bug. ++ ++28. If pcregrep was given the -q option with -c or -l, or when handling a ++ binary file, it incorrectly wrote output to stdout. ++ ++29. The JIT compiler did not restore the control verb head in case of *THEN ++ control verbs. This issue was found by Karl Skomski with a custom LLVM ++ fuzzer. ++ ++30. Error messages for syntax errors following \g and \k were giving inaccurate ++ offsets in the pattern. ++ ++31. Added a check for integer overflow in conditions (?(<digits>) and ++ (?(R<digits>). This omission was discovered by Karl Skomski with the LLVM ++ fuzzer. ++ ++32. Handling recursive references such as (?2) when the reference is to a group ++ later in the pattern uses code that is very hacked about and error-prone. ++ It has been re-written for PCRE2. Here in PCRE1, a check has been added to ++ give an internal error if it is obvious that compiling has gone wrong. ++ ++33. The JIT compiler should not check repeats after a {0,1} repeat byte code. ++ This issue was found by Karl Skomski with a custom LLVM fuzzer. ++ ++34. The JIT compiler should restore the control chain for empty possessive ++ repeats. This issue was found by Karl Skomski with a custom LLVM fuzzer. ++ ++35. Match limit check added to JIT recursion. This issue was found by Karl ++ Skomski with a custom LLVM fuzzer. ++ ++36. Yet another case similar to 27 above has been circumvented by an ++ unconditional allocation of extra memory. This issue is fixed "properly" in ++ PCRE2 by refactoring the way references are handled. Wen Guanxing ++ from Venustech ADLAB discovered this bug. ++ ++37. Fix two assertion fails in JIT. These issues were found by Karl Skomski ++ with a custom LLVM fuzzer. ++ ++38. Fixed a corner case of range optimization in JIT. ++ ++39. An incorrect error "overran compiling workspace" was given if there were ++ exactly enough group forward references such that the last one extended ++ into the workspace safety margin. 
The next one would have expanded the ++ workspace. The test for overflow was not including the safety margin. ++ ++40. A match limit issue is fixed in JIT which was found by Karl Skomski ++ with a custom LLVM fuzzer. ++ ++41. Remove the use of /dev/null in testdata/testinput2, because it doesn't ++ work under Windows. (Why has it taken so long for anyone to notice?) ++ ++42. In a character class such as [\W\p{Any}] where both a negative-type escape ++ ("not a word character") and a property escape were present, the property ++ escape was being ignored. ++ ++43. Fix crash caused by very long (*MARK) or (*THEN) names. ++ ++44. A sequence such as [[:punct:]b] that is, a POSIX character class followed ++ by a single ASCII character in a class item, was incorrectly compiled in ++ UCP mode. The POSIX class got lost, but only if the single character ++ followed it. ++ ++45. [:punct:] in UCP mode was matching some characters in the range 128-255 ++ that should not have been matched. ++ ++46. If [:^ascii:] or [:^xdigit:] or [:^cntrl:] are present in a non-negated ++ class, all characters with code points greater than 255 are in the class. ++ When a Unicode property was also in the class (if PCRE_UCP is set, escapes ++ such as \w are turned into Unicode properties), wide characters were not ++ correctly handled, and could fail to match. ++ ++ + Version 8.37 28-April-2015 + -------------------------- + +diff --git a/ext/pcre/pcrelib/NEWS b/ext/pcre/pcrelib/NEWS +index 064bf27..7e42dcb 100644 +--- a/ext/pcre/pcrelib/NEWS ++++ b/ext/pcre/pcrelib/NEWS +@@ -1,6 +1,14 @@ + News about PCRE releases + ------------------------ + ++Release 8.38 23-November-2015 ++----------------------------- ++ ++This is a bug-fix release. Note that this library (now called PCRE1) is now being ++maintained for bug fixes only. New projects are advised to use the new PCRE2 ++libraries. 
++ ++ + Release 8.37 28-April-2015 + -------------------------- + +diff --git a/ext/pcre/pcrelib/config.h b/ext/pcre/pcrelib/config.h +index ba06a17..0f7a9f7 100644 +--- a/ext/pcre/pcrelib/config.h ++++ b/ext/pcre/pcrelib/config.h +@@ -234,8 +234,8 @@ them both to 0; an emulation function will be used. */ + #define LINK_SIZE 2 + #endif + +-/* Define to the sub-directory in which libtool stores uninstalled libraries. +- */ ++/* Define to the sub-directory where libtool stores uninstalled libraries. */ ++/* This is ignored unless you are using libtool. */ + #ifndef LT_OBJDIR + #define LT_OBJDIR ".libs/" + #endif +@@ -314,7 +314,7 @@ them both to 0; an emulation function will be used. */ + #define PACKAGE_NAME "PCRE" + + /* Define to the full name and version of this package. */ +-#define PACKAGE_STRING "PCRE 8.37" ++#define PACKAGE_STRING "PCRE 8.38" + + /* Define to the one symbol short name of this package. */ + #define PACKAGE_TARNAME "pcre" +@@ -323,7 +323,7 @@ them both to 0; an emulation function will be used. */ + #define PACKAGE_URL "" + + /* Define to the version of this package. */ +-#define PACKAGE_VERSION "8.37" ++#define PACKAGE_VERSION "8.38" + + /* to make a symbol visible */ + /* #undef PCRECPP_EXP_DECL */ +@@ -439,7 +439,7 @@ them both to 0; an emulation function will be used. */ + + /* Version number of package */ + #ifndef VERSION +-#define VERSION "8.37" ++#define VERSION "8.38" + #endif + + /* Define to empty if `const' does not conform to ANSI C. */ +@@ -451,4 +451,3 @@ them both to 0; an emulation function will be used. */ + + /* Define to `unsigned int' if <sys/types.h> does not define. 
*/ + /* #undef size_t */ +- +diff --git a/ext/pcre/pcrelib/doc/pcre.txt b/ext/pcre/pcrelib/doc/pcre.txt +index ce27f4b..76a47c7 100644 +--- a/ext/pcre/pcrelib/doc/pcre.txt ++++ b/ext/pcre/pcrelib/doc/pcre.txt +@@ -13,7 +13,18 @@ PCRE(3) Library Functions Manual PCRE(3) + + + NAME +- PCRE - Perl-compatible regular expressions ++ PCRE - Perl-compatible regular expressions (original API) ++ ++PLEASE TAKE NOTE ++ ++ This document relates to PCRE releases that use the original API, with ++ library names libpcre, libpcre16, and libpcre32. January 2015 saw the ++ first release of a new API, known as PCRE2, with release numbers start- ++ ing at 10.00 and library names libpcre2-8, libpcre2-16, and ++ libpcre2-32. The old libraries (now called PCRE1) are still being main- ++ tained for bug fixes, but there will be no new development. New ++ projects are advised to use the new PCRE2 libraries. ++ + + INTRODUCTION + +@@ -179,8 +190,8 @@ AUTHOR + + REVISION + +- Last updated: 08 January 2014 +- Copyright (c) 1997-2014 University of Cambridge. ++ Last updated: 10 February 2015 ++ Copyright (c) 1997-2015 University of Cambridge. + ------------------------------------------------------------------------------ + + +@@ -4989,7 +5000,8 @@ BACKSLASH + appearance of non-printing characters, apart from the binary zero that + terminates a pattern, but when a pattern is being prepared by text + editing, it is often easier to use one of the following escape +- sequences than the binary character it represents: ++ sequences than the binary character it represents. In an ASCII or Uni- ++ code environment, these escapes are as follows: + + \a alarm, that is, the BEL character (hex 07) + \cx "control-x", where x is any ASCII character +@@ -5005,55 +5017,67 @@ BACKSLASH + \x{hhh..} character with hex code hhh.. 
(non-JavaScript mode) + \uhhhh character with hex code hhhh (JavaScript mode only) + +- The precise effect of \cx on ASCII characters is as follows: if x is a +- lower case letter, it is converted to upper case. Then bit 6 of the ++ The precise effect of \cx on ASCII characters is as follows: if x is a ++ lower case letter, it is converted to upper case. Then bit 6 of the + character (hex 40) is inverted. Thus \cA to \cZ become hex 01 to hex 1A +- (A is 41, Z is 5A), but \c{ becomes hex 3B ({ is 7B), and \c; becomes +- hex 7B (; is 3B). If the data item (byte or 16-bit value) following \c +- has a value greater than 127, a compile-time error occurs. This locks ++ (A is 41, Z is 5A), but \c{ becomes hex 3B ({ is 7B), and \c; becomes ++ hex 7B (; is 3B). If the data item (byte or 16-bit value) following \c ++ has a value greater than 127, a compile-time error occurs. This locks + out non-ASCII characters in all modes. + +- The \c facility was designed for use with ASCII characters, but with +- the extension to Unicode it is even less useful than it once was. It +- is, however, recognized when PCRE is compiled in EBCDIC mode, where +- data items are always bytes. In this mode, all values are valid after +- \c. If the next character is a lower case letter, it is converted to +- upper case. Then the 0xc0 bits of the byte are inverted. Thus \cA +- becomes hex 01, as in ASCII (A is C1), but because the EBCDIC letters +- are disjoint, \cZ becomes hex 29 (Z is E9), and other characters also +- generate different values. +- +- After \0 up to two further octal digits are read. If there are fewer +- than two digits, just those that are present are used. Thus the +- sequence \0\x\07 specifies two binary zeros followed by a BEL character +- (code value 7). Make sure you supply two digits after the initial zero ++ When PCRE is compiled in EBCDIC mode, \a, \e, \f, \n, \r, and \t gener- ++ ate the appropriate EBCDIC code values. 
The \c escape is processed as ++ specified for Perl in the perlebcdic document. The only characters that ++ are allowed after \c are A-Z, a-z, or one of @, [, \, ], ^, _, or ?. ++ Any other character provokes a compile-time error. The sequence \@ ++ encodes character code 0; the letters (in either case) encode charac- ++ ters 1-26 (hex 01 to hex 1A); [, \, ], ^, and _ encode characters 27-31 ++ (hex 1B to hex 1F), and \? becomes either 255 (hex FF) or 95 (hex 5F). ++ ++ Thus, apart from \?, these escapes generate the same character code ++ values as they do in an ASCII environment, though the meanings of the ++ values mostly differ. For example, \G always generates code value 7, ++ which is BEL in ASCII but DEL in EBCDIC. ++ ++ The sequence \? generates DEL (127, hex 7F) in an ASCII environment, ++ but because 127 is not a control character in EBCDIC, Perl makes it ++ generate the APC character. Unfortunately, there are several variants ++ of EBCDIC. In most of them the APC character has the value 255 (hex ++ FF), but in the one Perl calls POSIX-BC its value is 95 (hex 5F). If ++ certain other characters have POSIX-BC values, PCRE makes \? generate ++ 95; otherwise it generates 255. ++ ++ After \0 up to two further octal digits are read. If there are fewer ++ than two digits, just those that are present are used. Thus the ++ sequence \0\x\015 specifies two binary zeros followed by a CR character ++ (code value 13). Make sure you supply two digits after the initial zero + if the pattern character that follows is itself an octal digit. + +- The escape \o must be followed by a sequence of octal digits, enclosed +- in braces. An error occurs if this is not the case. This escape is a +- recent addition to Perl; it provides way of specifying character code +- points as octal numbers greater than 0777, and it also allows octal ++ The escape \o must be followed by a sequence of octal digits, enclosed ++ in braces. An error occurs if this is not the case. 
This escape is a ++ recent addition to Perl; it provides way of specifying character code ++ points as octal numbers greater than 0777, and it also allows octal + numbers and back references to be unambiguously specified. + + For greater clarity and unambiguity, it is best to avoid following \ by + a digit greater than zero. Instead, use \o{} or \x{} to specify charac- +- ter numbers, and \g{} to specify back references. The following para- ++ ter numbers, and \g{} to specify back references. The following para- + graphs describe the old, ambiguous syntax. + + The handling of a backslash followed by a digit other than 0 is compli- +- cated, and Perl has changed in recent releases, causing PCRE also to ++ cated, and Perl has changed in recent releases, causing PCRE also to + change. Outside a character class, PCRE reads the digit and any follow- +- ing digits as a decimal number. If the number is less than 8, or if +- there have been at least that many previous capturing left parentheses +- in the expression, the entire sequence is taken as a back reference. A +- description of how this works is given later, following the discussion ++ ing digits as a decimal number. If the number is less than 8, or if ++ there have been at least that many previous capturing left parentheses ++ in the expression, the entire sequence is taken as a back reference. A ++ description of how this works is given later, following the discussion + of parenthesized subpatterns. + +- Inside a character class, or if the decimal number following \ is ++ Inside a character class, or if the decimal number following \ is + greater than 7 and there have not been that many capturing subpatterns, +- PCRE handles \8 and \9 as the literal characters "8" and "9", and oth- ++ PCRE handles \8 and \9 as the literal characters "8" and "9", and oth- + erwise re-reads up to three octal digits following the backslash, using +- them to generate a data character. 
Any subsequent digits stand for ++ them to generate a data character. Any subsequent digits stand for + themselves. For example: + + \040 is another way of writing an ASCII space +@@ -5071,31 +5095,31 @@ BACKSLASH + \81 is either a back reference, or the two + characters "8" and "1" + +- Note that octal values of 100 or greater that are specified using this +- syntax must not be introduced by a leading zero, because no more than ++ Note that octal values of 100 or greater that are specified using this ++ syntax must not be introduced by a leading zero, because no more than + three octal digits are ever read. + +- By default, after \x that is not followed by {, from zero to two hexa- +- decimal digits are read (letters can be in upper or lower case). Any ++ By default, after \x that is not followed by {, from zero to two hexa- ++ decimal digits are read (letters can be in upper or lower case). Any + number of hexadecimal digits may appear between \x{ and }. If a charac- +- ter other than a hexadecimal digit appears between \x{ and }, or if ++ ter other than a hexadecimal digit appears between \x{ and }, or if + there is no terminating }, an error occurs. + +- If the PCRE_JAVASCRIPT_COMPAT option is set, the interpretation of \x +- is as just described only when it is followed by two hexadecimal dig- +- its. Otherwise, it matches a literal "x" character. In JavaScript ++ If the PCRE_JAVASCRIPT_COMPAT option is set, the interpretation of \x ++ is as just described only when it is followed by two hexadecimal dig- ++ its. Otherwise, it matches a literal "x" character. In JavaScript + mode, support for code points greater than 256 is provided by \u, which +- must be followed by four hexadecimal digits; otherwise it matches a ++ must be followed by four hexadecimal digits; otherwise it matches a + literal "u" character. + + Characters whose value is less than 256 can be defined by either of the +- two syntaxes for \x (or by \u in JavaScript mode). 
There is no differ- ++ two syntaxes for \x (or by \u in JavaScript mode). There is no differ- + ence in the way they are handled. For example, \xdc is exactly the same + as \x{dc} (or \u00dc in JavaScript mode). + + Constraints on character values + +- Characters that are specified using octal or hexadecimal numbers are ++ Characters that are specified using octal or hexadecimal numbers are + limited to certain values, as follows: + + 8-bit non-UTF mode less than 0x100 +@@ -5105,44 +5129,44 @@ BACKSLASH + 32-bit non-UTF mode less than 0x100000000 + 32-bit UTF-32 mode less than 0x10ffff and a valid codepoint + +- Invalid Unicode codepoints are the range 0xd800 to 0xdfff (the so- ++ Invalid Unicode codepoints are the range 0xd800 to 0xdfff (the so- + called "surrogate" codepoints), and 0xffef. + + Escape sequences in character classes + + All the sequences that define a single character value can be used both +- inside and outside character classes. In addition, inside a character ++ inside and outside character classes. In addition, inside a character + class, \b is interpreted as the backspace character (hex 08). + +- \N is not allowed in a character class. \B, \R, and \X are not special +- inside a character class. Like other unrecognized escape sequences, +- they are treated as the literal characters "B", "R", and "X" by +- default, but cause an error if the PCRE_EXTRA option is set. Outside a ++ \N is not allowed in a character class. \B, \R, and \X are not special ++ inside a character class. Like other unrecognized escape sequences, ++ they are treated as the literal characters "B", "R", and "X" by ++ default, but cause an error if the PCRE_EXTRA option is set. Outside a + character class, these sequences have different meanings. + + Unsupported escape sequences + +- In Perl, the sequences \l, \L, \u, and \U are recognized by its string +- handler and used to modify the case of following characters. By +- default, PCRE does not support these escape sequences. 
However, if the +- PCRE_JAVASCRIPT_COMPAT option is set, \U matches a "U" character, and ++ In Perl, the sequences \l, \L, \u, and \U are recognized by its string ++ handler and used to modify the case of following characters. By ++ default, PCRE does not support these escape sequences. However, if the ++ PCRE_JAVASCRIPT_COMPAT option is set, \U matches a "U" character, and + \u can be used to define a character by code point, as described in the + previous section. + + Absolute and relative back references + +- The sequence \g followed by an unsigned or a negative number, option- +- ally enclosed in braces, is an absolute or relative back reference. A ++ The sequence \g followed by an unsigned or a negative number, option- ++ ally enclosed in braces, is an absolute or relative back reference. A + named back reference can be coded as \g{name}. Back references are dis- + cussed later, following the discussion of parenthesized subpatterns. + + Absolute and relative subroutine calls + +- For compatibility with Oniguruma, the non-Perl syntax \g followed by a ++ For compatibility with Oniguruma, the non-Perl syntax \g followed by a + name or a number enclosed either in angle brackets or single quotes, is +- an alternative syntax for referencing a subpattern as a "subroutine". +- Details are discussed later. Note that \g{...} (Perl syntax) and +- \g<...> (Oniguruma syntax) are not synonymous. The former is a back ++ an alternative syntax for referencing a subpattern as a "subroutine". ++ Details are discussed later. Note that \g{...} (Perl syntax) and ++ \g<...> (Oniguruma syntax) are not synonymous. The former is a back + reference; the latter is a subroutine call. + + Generic character types +@@ -5161,59 +5185,59 @@ BACKSLASH + \W any "non-word" character + + There is also the single sequence \N, which matches a non-newline char- +- acter. This is the same as the "." metacharacter when PCRE_DOTALL is +- not set. 
Perl also uses \N to match characters by name; PCRE does not ++ acter. This is the same as the "." metacharacter when PCRE_DOTALL is ++ not set. Perl also uses \N to match characters by name; PCRE does not + support this. + +- Each pair of lower and upper case escape sequences partitions the com- +- plete set of characters into two disjoint sets. Any given character +- matches one, and only one, of each pair. The sequences can appear both +- inside and outside character classes. They each match one character of +- the appropriate type. If the current matching point is at the end of +- the subject string, all of them fail, because there is no character to ++ Each pair of lower and upper case escape sequences partitions the com- ++ plete set of characters into two disjoint sets. Any given character ++ matches one, and only one, of each pair. The sequences can appear both ++ inside and outside character classes. They each match one character of ++ the appropriate type. If the current matching point is at the end of ++ the subject string, all of them fail, because there is no character to + match. + +- For compatibility with Perl, \s did not used to match the VT character +- (code 11), which made it different from the the POSIX "space" class. +- However, Perl added VT at release 5.18, and PCRE followed suit at +- release 8.34. The default \s characters are now HT (9), LF (10), VT +- (11), FF (12), CR (13), and space (32), which are defined as white ++ For compatibility with Perl, \s did not used to match the VT character ++ (code 11), which made it different from the the POSIX "space" class. ++ However, Perl added VT at release 5.18, and PCRE followed suit at ++ release 8.34. The default \s characters are now HT (9), LF (10), VT ++ (11), FF (12), CR (13), and space (32), which are defined as white + space in the "C" locale. This list may vary if locale-specific matching +- is taking place. 
For example, in some locales the "non-breaking space" +- character (\xA0) is recognized as white space, and in others the VT ++ is taking place. For example, in some locales the "non-breaking space" ++ character (\xA0) is recognized as white space, and in others the VT + character is not. + +- A "word" character is an underscore or any character that is a letter +- or digit. By default, the definition of letters and digits is con- +- trolled by PCRE's low-valued character tables, and may vary if locale- +- specific matching is taking place (see "Locale support" in the pcreapi +- page). For example, in a French locale such as "fr_FR" in Unix-like +- systems, or "french" in Windows, some character codes greater than 127 +- are used for accented letters, and these are then matched by \w. The ++ A "word" character is an underscore or any character that is a letter ++ or digit. By default, the definition of letters and digits is con- ++ trolled by PCRE's low-valued character tables, and may vary if locale- ++ specific matching is taking place (see "Locale support" in the pcreapi ++ page). For example, in a French locale such as "fr_FR" in Unix-like ++ systems, or "french" in Windows, some character codes greater than 127 ++ are used for accented letters, and these are then matched by \w. The + use of locales with Unicode is discouraged. + +- By default, characters whose code points are greater than 127 never ++ By default, characters whose code points are greater than 127 never + match \d, \s, or \w, and always match \D, \S, and \W, although this may +- vary for characters in the range 128-255 when locale-specific matching +- is happening. These escape sequences retain their original meanings +- from before Unicode support was available, mainly for efficiency rea- +- sons. 
If PCRE is compiled with Unicode property support, and the +- PCRE_UCP option is set, the behaviour is changed so that Unicode prop- ++ vary for characters in the range 128-255 when locale-specific matching ++ is happening. These escape sequences retain their original meanings ++ from before Unicode support was available, mainly for efficiency rea- ++ sons. If PCRE is compiled with Unicode property support, and the ++ PCRE_UCP option is set, the behaviour is changed so that Unicode prop- + erties are used to determine character types, as follows: + + \d any character that matches \p{Nd} (decimal digit) + \s any character that matches \p{Z} or \h or \v + \w any character that matches \p{L} or \p{N}, plus underscore + +- The upper case escapes match the inverse sets of characters. Note that +- \d matches only decimal digits, whereas \w matches any Unicode digit, +- as well as any Unicode letter, and underscore. Note also that PCRE_UCP +- affects \b, and \B because they are defined in terms of \w and \W. ++ The upper case escapes match the inverse sets of characters. Note that ++ \d matches only decimal digits, whereas \w matches any Unicode digit, ++ as well as any Unicode letter, and underscore. Note also that PCRE_UCP ++ affects \b, and \B because they are defined in terms of \w and \W. + Matching these sequences is noticeably slower when PCRE_UCP is set. + +- The sequences \h, \H, \v, and \V are features that were added to Perl +- at release 5.10. In contrast to the other sequences, which match only +- ASCII characters by default, these always match certain high-valued ++ The sequences \h, \H, \v, and \V are features that were added to Perl ++ at release 5.10. In contrast to the other sequences, which match only ++ ASCII characters by default, these always match certain high-valued + code points, whether or not PCRE_UCP is set. 
The horizontal space char- + acters are: + +@@ -5252,110 +5276,110 @@ BACKSLASH + + Newline sequences + +- Outside a character class, by default, the escape sequence \R matches +- any Unicode newline sequence. In 8-bit non-UTF-8 mode \R is equivalent ++ Outside a character class, by default, the escape sequence \R matches ++ any Unicode newline sequence. In 8-bit non-UTF-8 mode \R is equivalent + to the following: + + (?>\r\n|\n|\x0b|\f|\r|\x85) + +- This is an example of an "atomic group", details of which are given ++ This is an example of an "atomic group", details of which are given + below. This particular group matches either the two-character sequence +- CR followed by LF, or one of the single characters LF (linefeed, +- U+000A), VT (vertical tab, U+000B), FF (form feed, U+000C), CR (car- +- riage return, U+000D), or NEL (next line, U+0085). The two-character ++ CR followed by LF, or one of the single characters LF (linefeed, ++ U+000A), VT (vertical tab, U+000B), FF (form feed, U+000C), CR (car- ++ riage return, U+000D), or NEL (next line, U+0085). The two-character + sequence is treated as a single unit that cannot be split. + +- In other modes, two additional characters whose codepoints are greater ++ In other modes, two additional characters whose codepoints are greater + than 255 are added: LS (line separator, U+2028) and PS (paragraph sepa- +- rator, U+2029). Unicode character property support is not needed for ++ rator, U+2029). Unicode character property support is not needed for + these characters to be recognized. + + It is possible to restrict \R to match only CR, LF, or CRLF (instead of +- the complete set of Unicode line endings) by setting the option ++ the complete set of Unicode line endings) by setting the option + PCRE_BSR_ANYCRLF either at compile time or when the pattern is matched. + (BSR is an abbrevation for "backslash R".) 
This can be made the default +- when PCRE is built; if this is the case, the other behaviour can be +- requested via the PCRE_BSR_UNICODE option. It is also possible to +- specify these settings by starting a pattern string with one of the ++ when PCRE is built; if this is the case, the other behaviour can be ++ requested via the PCRE_BSR_UNICODE option. It is also possible to ++ specify these settings by starting a pattern string with one of the + following sequences: + + (*BSR_ANYCRLF) CR, LF, or CRLF only + (*BSR_UNICODE) any Unicode newline sequence + + These override the default and the options given to the compiling func- +- tion, but they can themselves be overridden by options given to a +- matching function. Note that these special settings, which are not +- Perl-compatible, are recognized only at the very start of a pattern, +- and that they must be in upper case. If more than one of them is +- present, the last one is used. They can be combined with a change of ++ tion, but they can themselves be overridden by options given to a ++ matching function. Note that these special settings, which are not ++ Perl-compatible, are recognized only at the very start of a pattern, ++ and that they must be in upper case. If more than one of them is ++ present, the last one is used. They can be combined with a change of + newline convention; for example, a pattern can start with: + + (*ANY)(*BSR_ANYCRLF) + +- They can also be combined with the (*UTF8), (*UTF16), (*UTF32), (*UTF) ++ They can also be combined with the (*UTF8), (*UTF16), (*UTF32), (*UTF) + or (*UCP) special sequences. Inside a character class, \R is treated as +- an unrecognized escape sequence, and so matches the letter "R" by ++ an unrecognized escape sequence, and so matches the letter "R" by + default, but causes an error if PCRE_EXTRA is set. 
+ + Unicode character properties + + When PCRE is built with Unicode character property support, three addi- +- tional escape sequences that match characters with specific properties +- are available. When in 8-bit non-UTF-8 mode, these sequences are of +- course limited to testing characters whose codepoints are less than ++ tional escape sequences that match characters with specific properties ++ are available. When in 8-bit non-UTF-8 mode, these sequences are of ++ course limited to testing characters whose codepoints are less than + 256, but they do work in this mode. The extra escape sequences are: + + \p{xx} a character with the xx property + \P{xx} a character without the xx property + \X a Unicode extended grapheme cluster + +- The property names represented by xx above are limited to the Unicode ++ The property names represented by xx above are limited to the Unicode + script names, the general category properties, "Any", which matches any +- character (including newline), and some special PCRE properties +- (described in the next section). Other Perl properties such as "InMu- +- sicalSymbols" are not currently supported by PCRE. Note that \P{Any} ++ character (including newline), and some special PCRE properties ++ (described in the next section). Other Perl properties such as "InMu- ++ sicalSymbols" are not currently supported by PCRE. Note that \P{Any} + does not match any characters, so always causes a match failure. + + Sets of Unicode characters are defined as belonging to certain scripts. +- A character from one of these sets can be matched using a script name. ++ A character from one of these sets can be matched using a script name. + For example: + + \p{Greek} + \P{Han} + +- Those that are not part of an identified script are lumped together as ++ Those that are not part of an identified script are lumped together as + "Common". 
The current list of scripts is: + +- Arabic, Armenian, Avestan, Balinese, Bamum, Bassa_Vah, Batak, Bengali, +- Bopomofo, Brahmi, Braille, Buginese, Buhid, Canadian_Aboriginal, Car- ++ Arabic, Armenian, Avestan, Balinese, Bamum, Bassa_Vah, Batak, Bengali, ++ Bopomofo, Brahmi, Braille, Buginese, Buhid, Canadian_Aboriginal, Car- + ian, Caucasian_Albanian, Chakma, Cham, Cherokee, Common, Coptic, Cunei- + form, Cypriot, Cyrillic, Deseret, Devanagari, Duployan, Egyptian_Hiero- + glyphs, Elbasan, Ethiopic, Georgian, Glagolitic, Gothic, Grantha, +- Greek, Gujarati, Gurmukhi, Han, Hangul, Hanunoo, Hebrew, Hiragana, +- Imperial_Aramaic, Inherited, Inscriptional_Pahlavi, Inscrip- +- tional_Parthian, Javanese, Kaithi, Kannada, Katakana, Kayah_Li, +- Kharoshthi, Khmer, Khojki, Khudawadi, Lao, Latin, Lepcha, Limbu, Lin- +- ear_A, Linear_B, Lisu, Lycian, Lydian, Mahajani, Malayalam, Mandaic, +- Manichaean, Meetei_Mayek, Mende_Kikakui, Meroitic_Cursive, +- Meroitic_Hieroglyphs, Miao, Modi, Mongolian, Mro, Myanmar, Nabataean, +- New_Tai_Lue, Nko, Ogham, Ol_Chiki, Old_Italic, Old_North_Arabian, ++ Greek, Gujarati, Gurmukhi, Han, Hangul, Hanunoo, Hebrew, Hiragana, ++ Imperial_Aramaic, Inherited, Inscriptional_Pahlavi, Inscrip- ++ tional_Parthian, Javanese, Kaithi, Kannada, Katakana, Kayah_Li, ++ Kharoshthi, Khmer, Khojki, Khudawadi, Lao, Latin, Lepcha, Limbu, Lin- ++ ear_A, Linear_B, Lisu, Lycian, Lydian, Mahajani, Malayalam, Mandaic, ++ Manichaean, Meetei_Mayek, Mende_Kikakui, Meroitic_Cursive, ++ Meroitic_Hieroglyphs, Miao, Modi, Mongolian, Mro, Myanmar, Nabataean, ++ New_Tai_Lue, Nko, Ogham, Ol_Chiki, Old_Italic, Old_North_Arabian, + Old_Permic, Old_Persian, Old_South_Arabian, Old_Turkic, Oriya, Osmanya, + Pahawh_Hmong, Palmyrene, Pau_Cin_Hau, Phags_Pa, Phoenician, +- Psalter_Pahlavi, Rejang, Runic, Samaritan, Saurashtra, Sharada, Sha- +- vian, Siddham, Sinhala, Sora_Sompeng, Sundanese, Syloti_Nagri, Syriac, +- Tagalog, Tagbanwa, Tai_Le, Tai_Tham, Tai_Viet, Takri, Tamil, Telugu, 
+- Thaana, Thai, Tibetan, Tifinagh, Tirhuta, Ugaritic, Vai, Warang_Citi, ++ Psalter_Pahlavi, Rejang, Runic, Samaritan, Saurashtra, Sharada, Sha- ++ vian, Siddham, Sinhala, Sora_Sompeng, Sundanese, Syloti_Nagri, Syriac, ++ Tagalog, Tagbanwa, Tai_Le, Tai_Tham, Tai_Viet, Takri, Tamil, Telugu, ++ Thaana, Thai, Tibetan, Tifinagh, Tirhuta, Ugaritic, Vai, Warang_Citi, + Yi. + + Each character has exactly one Unicode general category property, spec- +- ified by a two-letter abbreviation. For compatibility with Perl, nega- +- tion can be specified by including a circumflex between the opening +- brace and the property name. For example, \p{^Lu} is the same as ++ ified by a two-letter abbreviation. For compatibility with Perl, nega- ++ tion can be specified by including a circumflex between the opening ++ brace and the property name. For example, \p{^Lu} is the same as + \P{Lu}. + + If only one letter is specified with \p or \P, it includes all the gen- +- eral category properties that start with that letter. In this case, in +- the absence of negation, the curly brackets in the escape sequence are ++ eral category properties that start with that letter. In this case, in ++ the absence of negation, the curly brackets in the escape sequence are + optional; these two examples have the same effect: + + \p{L} +@@ -5407,73 +5431,73 @@ BACKSLASH + Zp Paragraph separator + Zs Space separator + +- The special property L& is also supported: it matches a character that +- has the Lu, Ll, or Lt property, in other words, a letter that is not ++ The special property L& is also supported: it matches a character that ++ has the Lu, Ll, or Lt property, in other words, a letter that is not + classified as a modifier or "other". + +- The Cs (Surrogate) property applies only to characters in the range +- U+D800 to U+DFFF. 
Such characters are not valid in Unicode strings and +- so cannot be tested by PCRE, unless UTF validity checking has been ++ The Cs (Surrogate) property applies only to characters in the range ++ U+D800 to U+DFFF. Such characters are not valid in Unicode strings and ++ so cannot be tested by PCRE, unless UTF validity checking has been + turned off (see the discussion of PCRE_NO_UTF8_CHECK, +- PCRE_NO_UTF16_CHECK and PCRE_NO_UTF32_CHECK in the pcreapi page). Perl ++ PCRE_NO_UTF16_CHECK and PCRE_NO_UTF32_CHECK in the pcreapi page). Perl + does not support the Cs property. + +- The long synonyms for property names that Perl supports (such as +- \p{Letter}) are not supported by PCRE, nor is it permitted to prefix ++ The long synonyms for property names that Perl supports (such as ++ \p{Letter}) are not supported by PCRE, nor is it permitted to prefix + any of these properties with "Is". + + No character that is in the Unicode table has the Cn (unassigned) prop- + erty. Instead, this property is assumed for any code point that is not + in the Unicode table. + +- Specifying caseless matching does not affect these escape sequences. +- For example, \p{Lu} always matches only upper case letters. This is ++ Specifying caseless matching does not affect these escape sequences. ++ For example, \p{Lu} always matches only upper case letters. This is + different from the behaviour of current versions of Perl. + +- Matching characters by Unicode property is not fast, because PCRE has +- to do a multistage table lookup in order to find a character's prop- ++ Matching characters by Unicode property is not fast, because PCRE has ++ to do a multistage table lookup in order to find a character's prop- + erty. 
That is why the traditional escape sequences such as \d and \w do + not use Unicode properties in PCRE by default, though you can make them +- do so by setting the PCRE_UCP option or by starting the pattern with ++ do so by setting the PCRE_UCP option or by starting the pattern with + (*UCP). + + Extended grapheme clusters + +- The \X escape matches any number of Unicode characters that form an ++ The \X escape matches any number of Unicode characters that form an + "extended grapheme cluster", and treats the sequence as an atomic group +- (see below). Up to and including release 8.31, PCRE matched an ear- ++ (see below). Up to and including release 8.31, PCRE matched an ear- + lier, simpler definition that was equivalent to + + (?>\PM\pM*) + +- That is, it matched a character without the "mark" property, followed +- by zero or more characters with the "mark" property. Characters with +- the "mark" property are typically non-spacing accents that affect the ++ That is, it matched a character without the "mark" property, followed ++ by zero or more characters with the "mark" property. Characters with ++ the "mark" property are typically non-spacing accents that affect the + preceding character. + +- This simple definition was extended in Unicode to include more compli- +- cated kinds of composite character by giving each character a grapheme +- breaking property, and creating rules that use these properties to +- define the boundaries of extended grapheme clusters. In releases of ++ This simple definition was extended in Unicode to include more compli- ++ cated kinds of composite character by giving each character a grapheme ++ breaking property, and creating rules that use these properties to ++ define the boundaries of extended grapheme clusters. In releases of + PCRE later than 8.31, \X matches one of these clusters. + +- \X always matches at least one character. Then it decides whether to ++ \X always matches at least one character. 
Then it decides whether to + add additional characters according to the following rules for ending a + cluster: + + 1. End at the end of the subject string. + +- 2. Do not end between CR and LF; otherwise end after any control char- ++ 2. Do not end between CR and LF; otherwise end after any control char- + acter. + +- 3. Do not break Hangul (a Korean script) syllable sequences. Hangul +- characters are of five types: L, V, T, LV, and LVT. An L character may +- be followed by an L, V, LV, or LVT character; an LV or V character may ++ 3. Do not break Hangul (a Korean script) syllable sequences. Hangul ++ characters are of five types: L, V, T, LV, and LVT. An L character may ++ be followed by an L, V, LV, or LVT character; an LV or V character may + be followed by a V or T character; an LVT or T character may be follwed + only by a T character. + +- 4. Do not end before extending characters or spacing marks. Characters +- with the "mark" property always have the "extend" grapheme breaking ++ 4. Do not end before extending characters or spacing marks. Characters ++ with the "mark" property always have the "extend" grapheme breaking + property. + + 5. Do not end after prepend characters. +@@ -5482,9 +5506,9 @@ BACKSLASH + + PCRE's additional properties + +- As well as the standard Unicode properties described above, PCRE sup- +- ports four more that make it possible to convert traditional escape +- sequences such as \w and \s to use Unicode properties. PCRE uses these ++ As well as the standard Unicode properties described above, PCRE sup- ++ ports four more that make it possible to convert traditional escape ++ sequences such as \w and \s to use Unicode properties. PCRE uses these + non-standard, non-Perl properties internally when PCRE_UCP is set. How- + ever, they may also be used explicitly. 
These properties are: + +@@ -5493,54 +5517,54 @@ BACKSLASH + Xsp Any Perl space character + Xwd Any Perl "word" character + +- Xan matches characters that have either the L (letter) or the N (num- +- ber) property. Xps matches the characters tab, linefeed, vertical tab, +- form feed, or carriage return, and any other character that has the Z +- (separator) property. Xsp is the same as Xps; it used to exclude ver- +- tical tab, for Perl compatibility, but Perl changed, and so PCRE fol- +- lowed at release 8.34. Xwd matches the same characters as Xan, plus ++ Xan matches characters that have either the L (letter) or the N (num- ++ ber) property. Xps matches the characters tab, linefeed, vertical tab, ++ form feed, or carriage return, and any other character that has the Z ++ (separator) property. Xsp is the same as Xps; it used to exclude ver- ++ tical tab, for Perl compatibility, but Perl changed, and so PCRE fol- ++ lowed at release 8.34. Xwd matches the same characters as Xan, plus + underscore. + +- There is another non-standard property, Xuc, which matches any charac- +- ter that can be represented by a Universal Character Name in C++ and +- other programming languages. These are the characters $, @, ` (grave +- accent), and all characters with Unicode code points greater than or +- equal to U+00A0, except for the surrogates U+D800 to U+DFFF. Note that +- most base (ASCII) characters are excluded. (Universal Character Names +- are of the form \uHHHH or \UHHHHHHHH where H is a hexadecimal digit. ++ There is another non-standard property, Xuc, which matches any charac- ++ ter that can be represented by a Universal Character Name in C++ and ++ other programming languages. These are the characters $, @, ` (grave ++ accent), and all characters with Unicode code points greater than or ++ equal to U+00A0, except for the surrogates U+D800 to U+DFFF. Note that ++ most base (ASCII) characters are excluded. 
(Universal Character Names ++ are of the form \uHHHH or \UHHHHHHHH where H is a hexadecimal digit. + Note that the Xuc property does not match these sequences but the char- + acters that they represent.) + + Resetting the match start + +- The escape sequence \K causes any previously matched characters not to ++ The escape sequence \K causes any previously matched characters not to + be included in the final matched sequence. For example, the pattern: + + foo\Kbar + +- matches "foobar", but reports that it has matched "bar". This feature +- is similar to a lookbehind assertion (described below). However, in +- this case, the part of the subject before the real match does not have +- to be of fixed length, as lookbehind assertions do. The use of \K does +- not interfere with the setting of captured substrings. For example, ++ matches "foobar", but reports that it has matched "bar". This feature ++ is similar to a lookbehind assertion (described below). However, in ++ this case, the part of the subject before the real match does not have ++ to be of fixed length, as lookbehind assertions do. The use of \K does ++ not interfere with the setting of captured substrings. For example, + when the pattern + + (foo)\Kbar + + matches "foobar", the first substring is still set to "foo". + +- Perl documents that the use of \K within assertions is "not well +- defined". In PCRE, \K is acted upon when it occurs inside positive +- assertions, but is ignored in negative assertions. Note that when a +- pattern such as (?=ab\K) matches, the reported start of the match can ++ Perl documents that the use of \K within assertions is "not well ++ defined". In PCRE, \K is acted upon when it occurs inside positive ++ assertions, but is ignored in negative assertions. Note that when a ++ pattern such as (?=ab\K) matches, the reported start of the match can + be greater than the end of the match. + + Simple assertions + +- The final use of backslash is for certain simple assertions. 
An asser- +- tion specifies a condition that has to be met at a particular point in +- a match, without consuming any characters from the subject string. The +- use of subpatterns for more complicated assertions is described below. ++ The final use of backslash is for certain simple assertions. An asser- ++ tion specifies a condition that has to be met at a particular point in ++ a match, without consuming any characters from the subject string. The ++ use of subpatterns for more complicated assertions is described below. + The backslashed assertions are: + + \b matches at a word boundary +@@ -5551,161 +5575,161 @@ BACKSLASH + \z matches only at the end of the subject + \G matches at the first matching position in the subject + +- Inside a character class, \b has a different meaning; it matches the +- backspace character. If any other of these assertions appears in a +- character class, by default it matches the corresponding literal char- ++ Inside a character class, \b has a different meaning; it matches the ++ backspace character. If any other of these assertions appears in a ++ character class, by default it matches the corresponding literal char- + acter (for example, \B matches the letter B). However, if the +- PCRE_EXTRA option is set, an "invalid escape sequence" error is gener- ++ PCRE_EXTRA option is set, an "invalid escape sequence" error is gener- + ated instead. + +- A word boundary is a position in the subject string where the current +- character and the previous character do not both match \w or \W (i.e. +- one matches \w and the other matches \W), or the start or end of the +- string if the first or last character matches \w, respectively. In a +- UTF mode, the meanings of \w and \W can be changed by setting the +- PCRE_UCP option. When this is done, it also affects \b and \B. Neither +- PCRE nor Perl has a separate "start of word" or "end of word" metase- +- quence. However, whatever follows \b normally determines which it is. 
++ A word boundary is a position in the subject string where the current ++ character and the previous character do not both match \w or \W (i.e. ++ one matches \w and the other matches \W), or the start or end of the ++ string if the first or last character matches \w, respectively. In a ++ UTF mode, the meanings of \w and \W can be changed by setting the ++ PCRE_UCP option. When this is done, it also affects \b and \B. Neither ++ PCRE nor Perl has a separate "start of word" or "end of word" metase- ++ quence. However, whatever follows \b normally determines which it is. + For example, the fragment \ba matches "a" at the start of a word. + +- The \A, \Z, and \z assertions differ from the traditional circumflex ++ The \A, \Z, and \z assertions differ from the traditional circumflex + and dollar (described in the next section) in that they only ever match +- at the very start and end of the subject string, whatever options are +- set. Thus, they are independent of multiline mode. These three asser- ++ at the very start and end of the subject string, whatever options are ++ set. Thus, they are independent of multiline mode. These three asser- + tions are not affected by the PCRE_NOTBOL or PCRE_NOTEOL options, which +- affect only the behaviour of the circumflex and dollar metacharacters. +- However, if the startoffset argument of pcre_exec() is non-zero, indi- ++ affect only the behaviour of the circumflex and dollar metacharacters. ++ However, if the startoffset argument of pcre_exec() is non-zero, indi- + cating that matching is to start at a point other than the beginning of +- the subject, \A can never match. The difference between \Z and \z is ++ the subject, \A can never match. The difference between \Z and \z is + that \Z matches before a newline at the end of the string as well as at + the very end, whereas \z matches only at the end. 
+ +- The \G assertion is true only when the current matching position is at +- the start point of the match, as specified by the startoffset argument +- of pcre_exec(). It differs from \A when the value of startoffset is +- non-zero. By calling pcre_exec() multiple times with appropriate argu- ++ The \G assertion is true only when the current matching position is at ++ the start point of the match, as specified by the startoffset argument ++ of pcre_exec(). It differs from \A when the value of startoffset is ++ non-zero. By calling pcre_exec() multiple times with appropriate argu- + ments, you can mimic Perl's /g option, and it is in this kind of imple- + mentation where \G can be useful. + +- Note, however, that PCRE's interpretation of \G, as the start of the ++ Note, however, that PCRE's interpretation of \G, as the start of the + current match, is subtly different from Perl's, which defines it as the +- end of the previous match. In Perl, these can be different when the +- previously matched string was empty. Because PCRE does just one match ++ end of the previous match. In Perl, these can be different when the ++ previously matched string was empty. Because PCRE does just one match + at a time, it cannot reproduce this behaviour. + +- If all the alternatives of a pattern begin with \G, the expression is ++ If all the alternatives of a pattern begin with \G, the expression is + anchored to the starting match position, and the "anchored" flag is set + in the compiled regular expression. + + + CIRCUMFLEX AND DOLLAR + +- The circumflex and dollar metacharacters are zero-width assertions. +- That is, they test for a particular condition being true without con- ++ The circumflex and dollar metacharacters are zero-width assertions. ++ That is, they test for a particular condition being true without con- + suming any characters from the subject string. 
+ + Outside a character class, in the default matching mode, the circumflex +- character is an assertion that is true only if the current matching +- point is at the start of the subject string. If the startoffset argu- +- ment of pcre_exec() is non-zero, circumflex can never match if the +- PCRE_MULTILINE option is unset. Inside a character class, circumflex ++ character is an assertion that is true only if the current matching ++ point is at the start of the subject string. If the startoffset argu- ++ ment of pcre_exec() is non-zero, circumflex can never match if the ++ PCRE_MULTILINE option is unset. Inside a character class, circumflex + has an entirely different meaning (see below). + +- Circumflex need not be the first character of the pattern if a number +- of alternatives are involved, but it should be the first thing in each +- alternative in which it appears if the pattern is ever to match that +- branch. If all possible alternatives start with a circumflex, that is, +- if the pattern is constrained to match only at the start of the sub- +- ject, it is said to be an "anchored" pattern. (There are also other ++ Circumflex need not be the first character of the pattern if a number ++ of alternatives are involved, but it should be the first thing in each ++ alternative in which it appears if the pattern is ever to match that ++ branch. If all possible alternatives start with a circumflex, that is, ++ if the pattern is constrained to match only at the start of the sub- ++ ject, it is said to be an "anchored" pattern. (There are also other + constructs that can cause a pattern to be anchored.) + +- The dollar character is an assertion that is true only if the current +- matching point is at the end of the subject string, or immediately +- before a newline at the end of the string (by default). Note, however, +- that it does not actually match the newline. 
Dollar need not be the ++ The dollar character is an assertion that is true only if the current ++ matching point is at the end of the subject string, or immediately ++ before a newline at the end of the string (by default). Note, however, ++ that it does not actually match the newline. Dollar need not be the + last character of the pattern if a number of alternatives are involved, +- but it should be the last item in any branch in which it appears. Dol- ++ but it should be the last item in any branch in which it appears. Dol- + lar has no special meaning in a character class. + +- The meaning of dollar can be changed so that it matches only at the +- very end of the string, by setting the PCRE_DOLLAR_ENDONLY option at ++ The meaning of dollar can be changed so that it matches only at the ++ very end of the string, by setting the PCRE_DOLLAR_ENDONLY option at + compile time. This does not affect the \Z assertion. + + The meanings of the circumflex and dollar characters are changed if the +- PCRE_MULTILINE option is set. When this is the case, a circumflex +- matches immediately after internal newlines as well as at the start of +- the subject string. It does not match after a newline that ends the +- string. A dollar matches before any newlines in the string, as well as +- at the very end, when PCRE_MULTILINE is set. When newline is specified +- as the two-character sequence CRLF, isolated CR and LF characters do ++ PCRE_MULTILINE option is set. When this is the case, a circumflex ++ matches immediately after internal newlines as well as at the start of ++ the subject string. It does not match after a newline that ends the ++ string. A dollar matches before any newlines in the string, as well as ++ at the very end, when PCRE_MULTILINE is set. When newline is specified ++ as the two-character sequence CRLF, isolated CR and LF characters do + not indicate newlines. 
+ +- For example, the pattern /^abc$/ matches the subject string "def\nabc" +- (where \n represents a newline) in multiline mode, but not otherwise. +- Consequently, patterns that are anchored in single line mode because +- all branches start with ^ are not anchored in multiline mode, and a +- match for circumflex is possible when the startoffset argument of +- pcre_exec() is non-zero. The PCRE_DOLLAR_ENDONLY option is ignored if ++ For example, the pattern /^abc$/ matches the subject string "def\nabc" ++ (where \n represents a newline) in multiline mode, but not otherwise. ++ Consequently, patterns that are anchored in single line mode because ++ all branches start with ^ are not anchored in multiline mode, and a ++ match for circumflex is possible when the startoffset argument of ++ pcre_exec() is non-zero. The PCRE_DOLLAR_ENDONLY option is ignored if + PCRE_MULTILINE is set. + +- Note that the sequences \A, \Z, and \z can be used to match the start +- and end of the subject in both modes, and if all branches of a pattern +- start with \A it is always anchored, whether or not PCRE_MULTILINE is ++ Note that the sequences \A, \Z, and \z can be used to match the start ++ and end of the subject in both modes, and if all branches of a pattern ++ start with \A it is always anchored, whether or not PCRE_MULTILINE is + set. + + + FULL STOP (PERIOD, DOT) AND \N + + Outside a character class, a dot in the pattern matches any one charac- +- ter in the subject string except (by default) a character that signi- ++ ter in the subject string except (by default) a character that signi- + fies the end of a line. + +- When a line ending is defined as a single character, dot never matches +- that character; when the two-character sequence CRLF is used, dot does +- not match CR if it is immediately followed by LF, but otherwise it +- matches all characters (including isolated CRs and LFs). 
When any Uni- +- code line endings are being recognized, dot does not match CR or LF or ++ When a line ending is defined as a single character, dot never matches ++ that character; when the two-character sequence CRLF is used, dot does ++ not match CR if it is immediately followed by LF, but otherwise it ++ matches all characters (including isolated CRs and LFs). When any Uni- ++ code line endings are being recognized, dot does not match CR or LF or + any of the other line ending characters. + +- The behaviour of dot with regard to newlines can be changed. If the +- PCRE_DOTALL option is set, a dot matches any one character, without ++ The behaviour of dot with regard to newlines can be changed. If the ++ PCRE_DOTALL option is set, a dot matches any one character, without + exception. If the two-character sequence CRLF is present in the subject + string, it takes two dots to match it. + +- The handling of dot is entirely independent of the handling of circum- +- flex and dollar, the only relationship being that they both involve ++ The handling of dot is entirely independent of the handling of circum- ++ flex and dollar, the only relationship being that they both involve + newlines. Dot has no special meaning in a character class. + +- The escape sequence \N behaves like a dot, except that it is not +- affected by the PCRE_DOTALL option. In other words, it matches any +- character except one that signifies the end of a line. Perl also uses ++ The escape sequence \N behaves like a dot, except that it is not ++ affected by the PCRE_DOTALL option. In other words, it matches any ++ character except one that signifies the end of a line. Perl also uses + \N to match characters by name; PCRE does not support this. + + + MATCHING A SINGLE DATA UNIT + +- Outside a character class, the escape sequence \C matches any one data +- unit, whether or not a UTF mode is set. 
In the 8-bit library, one data +- unit is one byte; in the 16-bit library it is a 16-bit unit; in the +- 32-bit library it is a 32-bit unit. Unlike a dot, \C always matches +- line-ending characters. The feature is provided in Perl in order to ++ Outside a character class, the escape sequence \C matches any one data ++ unit, whether or not a UTF mode is set. In the 8-bit library, one data ++ unit is one byte; in the 16-bit library it is a 16-bit unit; in the ++ 32-bit library it is a 32-bit unit. Unlike a dot, \C always matches ++ line-ending characters. The feature is provided in Perl in order to + match individual bytes in UTF-8 mode, but it is unclear how it can use- +- fully be used. Because \C breaks up characters into individual data +- units, matching one unit with \C in a UTF mode means that the rest of ++ fully be used. Because \C breaks up characters into individual data ++ units, matching one unit with \C in a UTF mode means that the rest of + the string may start with a malformed UTF character. This has undefined + results, because PCRE assumes that it is dealing with valid UTF strings +- (and by default it checks this at the start of processing unless the +- PCRE_NO_UTF8_CHECK, PCRE_NO_UTF16_CHECK or PCRE_NO_UTF32_CHECK option ++ (and by default it checks this at the start of processing unless the ++ PCRE_NO_UTF8_CHECK, PCRE_NO_UTF16_CHECK or PCRE_NO_UTF32_CHECK option + is used). + +- PCRE does not allow \C to appear in lookbehind assertions (described +- below) in a UTF mode, because this would make it impossible to calcu- ++ PCRE does not allow \C to appear in lookbehind assertions (described ++ below) in a UTF mode, because this would make it impossible to calcu- + late the length of the lookbehind. + + In general, the \C escape sequence is best avoided. 
However, one way of +- using it that avoids the problem of malformed UTF characters is to use +- a lookahead to check the length of the next character, as in this pat- +- tern, which could be used with a UTF-8 string (ignore white space and ++ using it that avoids the problem of malformed UTF characters is to use ++ a lookahead to check the length of the next character, as in this pat- ++ tern, which could be used with a UTF-8 string (ignore white space and + line breaks): + + (?| (?=[\x00-\x7f])(\C) | +@@ -5713,11 +5737,11 @@ MATCHING A SINGLE DATA UNIT + (?=[\x{800}-\x{ffff}])(\C)(\C)(\C) | + (?=[\x{10000}-\x{1fffff}])(\C)(\C)(\C)(\C)) + +- A group that starts with (?| resets the capturing parentheses numbers +- in each alternative (see "Duplicate Subpattern Numbers" below). The +- assertions at the start of each branch check the next UTF-8 character +- for values whose encoding uses 1, 2, 3, or 4 bytes, respectively. The +- character's individual bytes are then captured by the appropriate num- ++ A group that starts with (?| resets the capturing parentheses numbers ++ in each alternative (see "Duplicate Subpattern Numbers" below). The ++ assertions at the start of each branch check the next UTF-8 character ++ for values whose encoding uses 1, 2, 3, or 4 bytes, respectively. The ++ character's individual bytes are then captured by the appropriate num- + ber of groups. + + +@@ -5727,109 +5751,109 @@ SQUARE BRACKETS AND CHARACTER CLASSES + closing square bracket. A closing square bracket on its own is not spe- + cial by default. However, if the PCRE_JAVASCRIPT_COMPAT option is set, + a lone closing square bracket causes a compile-time error. 
If a closing +- square bracket is required as a member of the class, it should be the +- first data character in the class (after an initial circumflex, if ++ square bracket is required as a member of the class, it should be the ++ first data character in the class (after an initial circumflex, if + present) or escaped with a backslash. + +- A character class matches a single character in the subject. In a UTF +- mode, the character may be more than one data unit long. A matched ++ A character class matches a single character in the subject. In a UTF ++ mode, the character may be more than one data unit long. A matched + character must be in the set of characters defined by the class, unless +- the first character in the class definition is a circumflex, in which ++ the first character in the class definition is a circumflex, in which + case the subject character must not be in the set defined by the class. +- If a circumflex is actually required as a member of the class, ensure ++ If a circumflex is actually required as a member of the class, ensure + it is not the first character, or escape it with a backslash. + +- For example, the character class [aeiou] matches any lower case vowel, +- while [^aeiou] matches any character that is not a lower case vowel. ++ For example, the character class [aeiou] matches any lower case vowel, ++ while [^aeiou] matches any character that is not a lower case vowel. + Note that a circumflex is just a convenient notation for specifying the +- characters that are in the class by enumerating those that are not. A +- class that starts with a circumflex is not an assertion; it still con- +- sumes a character from the subject string, and therefore it fails if ++ characters that are in the class by enumerating those that are not. A ++ class that starts with a circumflex is not an assertion; it still con- ++ sumes a character from the subject string, and therefore it fails if + the current pointer is at the end of the string. 
+ + In UTF-8 (UTF-16, UTF-32) mode, characters with values greater than 255 +- (0xffff) can be included in a class as a literal string of data units, ++ (0xffff) can be included in a class as a literal string of data units, + or by using the \x{ escaping mechanism. + +- When caseless matching is set, any letters in a class represent both +- their upper case and lower case versions, so for example, a caseless +- [aeiou] matches "A" as well as "a", and a caseless [^aeiou] does not +- match "A", whereas a caseful version would. In a UTF mode, PCRE always +- understands the concept of case for characters whose values are less +- than 128, so caseless matching is always possible. For characters with +- higher values, the concept of case is supported if PCRE is compiled +- with Unicode property support, but not otherwise. If you want to use +- caseless matching in a UTF mode for characters 128 and above, you must +- ensure that PCRE is compiled with Unicode property support as well as ++ When caseless matching is set, any letters in a class represent both ++ their upper case and lower case versions, so for example, a caseless ++ [aeiou] matches "A" as well as "a", and a caseless [^aeiou] does not ++ match "A", whereas a caseful version would. In a UTF mode, PCRE always ++ understands the concept of case for characters whose values are less ++ than 128, so caseless matching is always possible. For characters with ++ higher values, the concept of case is supported if PCRE is compiled ++ with Unicode property support, but not otherwise. If you want to use ++ caseless matching in a UTF mode for characters 128 and above, you must ++ ensure that PCRE is compiled with Unicode property support as well as + with UTF support. 
+ +- Characters that might indicate line breaks are never treated in any +- special way when matching character classes, whatever line-ending +- sequence is in use, and whatever setting of the PCRE_DOTALL and ++ Characters that might indicate line breaks are never treated in any ++ special way when matching character classes, whatever line-ending ++ sequence is in use, and whatever setting of the PCRE_DOTALL and + PCRE_MULTILINE options is used. A class such as [^a] always matches one + of these characters. + +- The minus (hyphen) character can be used to specify a range of charac- +- ters in a character class. For example, [d-m] matches any letter +- between d and m, inclusive. If a minus character is required in a +- class, it must be escaped with a backslash or appear in a position +- where it cannot be interpreted as indicating a range, typically as the ++ The minus (hyphen) character can be used to specify a range of charac- ++ ters in a character class. For example, [d-m] matches any letter ++ between d and m, inclusive. If a minus character is required in a ++ class, it must be escaped with a backslash or appear in a position ++ where it cannot be interpreted as indicating a range, typically as the + first or last character in the class, or immediately after a range. For +- example, [b-d-z] matches letters in the range b to d, a hyphen charac- ++ example, [b-d-z] matches letters in the range b to d, a hyphen charac- + ter, or z. + + It is not possible to have the literal character "]" as the end charac- +- ter of a range. A pattern such as [W-]46] is interpreted as a class of +- two characters ("W" and "-") followed by a literal string "46]", so it +- would match "W46]" or "-46]". However, if the "]" is escaped with a +- backslash it is interpreted as the end of range, so [W-\]46] is inter- +- preted as a class containing a range followed by two other characters. +- The octal or hexadecimal representation of "]" can also be used to end ++ ter of a range. 
A pattern such as [W-]46] is interpreted as a class of ++ two characters ("W" and "-") followed by a literal string "46]", so it ++ would match "W46]" or "-46]". However, if the "]" is escaped with a ++ backslash it is interpreted as the end of range, so [W-\]46] is inter- ++ preted as a class containing a range followed by two other characters. ++ The octal or hexadecimal representation of "]" can also be used to end + a range. + +- An error is generated if a POSIX character class (see below) or an +- escape sequence other than one that defines a single character appears +- at a point where a range ending character is expected. For example, ++ An error is generated if a POSIX character class (see below) or an ++ escape sequence other than one that defines a single character appears ++ at a point where a range ending character is expected. For example, + [z-\xff] is valid, but [A-\d] and [A-[:digit:]] are not. + +- Ranges operate in the collating sequence of character values. They can +- also be used for characters specified numerically, for example +- [\000-\037]. Ranges can include any characters that are valid for the ++ Ranges operate in the collating sequence of character values. They can ++ also be used for characters specified numerically, for example ++ [\000-\037]. Ranges can include any characters that are valid for the + current mode. + + If a range that includes letters is used when caseless matching is set, + it matches the letters in either case. For example, [W-c] is equivalent +- to [][\\^_`wxyzabc], matched caselessly, and in a non-UTF mode, if +- character tables for a French locale are in use, [\xc8-\xcb] matches +- accented E characters in both cases. In UTF modes, PCRE supports the +- concept of case for characters with values greater than 128 only when ++ to [][\\^_`wxyzabc], matched caselessly, and in a non-UTF mode, if ++ character tables for a French locale are in use, [\xc8-\xcb] matches ++ accented E characters in both cases. 
In UTF modes, PCRE supports the ++ concept of case for characters with values greater than 128 only when + it is compiled with Unicode property support. + +- The character escape sequences \d, \D, \h, \H, \p, \P, \s, \S, \v, \V, ++ The character escape sequences \d, \D, \h, \H, \p, \P, \s, \S, \v, \V, + \w, and \W may appear in a character class, and add the characters that +- they match to the class. For example, [\dABCDEF] matches any hexadeci- +- mal digit. In UTF modes, the PCRE_UCP option affects the meanings of +- \d, \s, \w and their upper case partners, just as it does when they +- appear outside a character class, as described in the section entitled ++ they match to the class. For example, [\dABCDEF] matches any hexadeci- ++ mal digit. In UTF modes, the PCRE_UCP option affects the meanings of ++ \d, \s, \w and their upper case partners, just as it does when they ++ appear outside a character class, as described in the section entitled + "Generic character types" above. The escape sequence \b has a different +- meaning inside a character class; it matches the backspace character. +- The sequences \B, \N, \R, and \X are not special inside a character +- class. Like any other unrecognized escape sequences, they are treated +- as the literal characters "B", "N", "R", and "X" by default, but cause ++ meaning inside a character class; it matches the backspace character. ++ The sequences \B, \N, \R, and \X are not special inside a character ++ class. Like any other unrecognized escape sequences, they are treated ++ as the literal characters "B", "N", "R", and "X" by default, but cause + an error if the PCRE_EXTRA option is set. + +- A circumflex can conveniently be used with the upper case character +- types to specify a more restricted set of characters than the matching +- lower case type. 
For example, the class [^\W_] matches any letter or ++ A circumflex can conveniently be used with the upper case character ++ types to specify a more restricted set of characters than the matching ++ lower case type. For example, the class [^\W_] matches any letter or + digit, but not underscore, whereas [\w] includes underscore. A positive + character class should be read as "something OR something OR ..." and a + negative class as "NOT something AND NOT something AND NOT ...". + +- The only metacharacters that are recognized in character classes are +- backslash, hyphen (only where it can be interpreted as specifying a +- range), circumflex (only at the start), opening square bracket (only +- when it can be interpreted as introducing a POSIX class name, or for a +- special compatibility feature - see the next two sections), and the ++ The only metacharacters that are recognized in character classes are ++ backslash, hyphen (only where it can be interpreted as specifying a ++ range), circumflex (only at the start), opening square bracket (only ++ when it can be interpreted as introducing a POSIX class name, or for a ++ special compatibility feature - see the next two sections), and the + terminating closing square bracket. However, escaping other non- + alphanumeric characters does no harm. + +@@ -5837,7 +5861,7 @@ SQUARE BRACKETS AND CHARACTER CLASSES + POSIX CHARACTER CLASSES + + Perl supports the POSIX notation for character classes. This uses names +- enclosed by [: and :] within the enclosing square brackets. PCRE also ++ enclosed by [: and :] within the enclosing square brackets. PCRE also + supports this notation. For example, + + [01[:alpha:]%] +@@ -5860,28 +5884,28 @@ POSIX CHARACTER CLASSES + word "word" characters (same as \w) + xdigit hexadecimal digits + +- The default "space" characters are HT (9), LF (10), VT (11), FF (12), +- CR (13), and space (32). 
If locale-specific matching is taking place, +- the list of space characters may be different; there may be fewer or ++ The default "space" characters are HT (9), LF (10), VT (11), FF (12), ++ CR (13), and space (32). If locale-specific matching is taking place, ++ the list of space characters may be different; there may be fewer or + more of them. "Space" used to be different to \s, which did not include + VT, for Perl compatibility. However, Perl changed at release 5.18, and +- PCRE followed at release 8.34. "Space" and \s now match the same set ++ PCRE followed at release 8.34. "Space" and \s now match the same set + of characters. + +- The name "word" is a Perl extension, and "blank" is a GNU extension +- from Perl 5.8. Another Perl extension is negation, which is indicated ++ The name "word" is a Perl extension, and "blank" is a GNU extension ++ from Perl 5.8. Another Perl extension is negation, which is indicated + by a ^ character after the colon. For example, + + [12[:^digit:]] + +- matches "1", "2", or any non-digit. PCRE (and Perl) also recognize the ++ matches "1", "2", or any non-digit. PCRE (and Perl) also recognize the + POSIX syntax [.ch.] and [=ch=] where "ch" is a "collating element", but + these are not supported, and an error is given if they are encountered. + + By default, characters with values greater than 128 do not match any of +- the POSIX character classes. However, if the PCRE_UCP option is passed +- to pcre_compile(), some of the classes are changed so that Unicode +- character properties are used. This is achieved by replacing certain ++ the POSIX character classes. However, if the PCRE_UCP option is passed ++ to pcre_compile(), some of the classes are changed so that Unicode ++ character properties are used. 
This is achieved by replacing certain + POSIX classes by other sequences, as follows: + + [:alnum:] becomes \p{Xan} +@@ -5893,10 +5917,10 @@ POSIX CHARACTER CLASSES + [:upper:] becomes \p{Lu} + [:word:] becomes \p{Xwd} + +- Negated versions, such as [:^alpha:] use \P instead of \p. Three other ++ Negated versions, such as [:^alpha:] use \P instead of \p. Three other + POSIX classes are handled specially in UCP mode: + +- [:graph:] This matches characters that have glyphs that mark the page ++ [:graph:] This matches characters that have glyphs that mark the page + when printed. In Unicode property terms, it matches all char- + acters with the L, M, N, P, S, or Cf properties, except for: + +@@ -5905,58 +5929,58 @@ POSIX CHARACTER CLASSES + U+2066 - U+2069 Various "isolate"s + + +- [:print:] This matches the same characters as [:graph:] plus space +- characters that are not controls, that is, characters with ++ [:print:] This matches the same characters as [:graph:] plus space ++ characters that are not controls, that is, characters with + the Zs property. + + [:punct:] This matches all characters that have the Unicode P (punctua- +- tion) property, plus those characters whose code points are ++ tion) property, plus those characters whose code points are + less than 128 that have the S (Symbol) property. + +- The other POSIX classes are unchanged, and match only characters with ++ The other POSIX classes are unchanged, and match only characters with + code points less than 128. + + + COMPATIBILITY FEATURE FOR WORD BOUNDARIES + +- In the POSIX.2 compliant library that was included in 4.4BSD Unix, the +- ugly syntax [[:<:]] and [[:>:]] is used for matching "start of word" ++ In the POSIX.2 compliant library that was included in 4.4BSD Unix, the ++ ugly syntax [[:<:]] and [[:>:]] is used for matching "start of word" + and "end of word". 
PCRE treats these items as follows: + + [[:<:]] is converted to \b(?=\w) + [[:>:]] is converted to \b(?<=\w) + + Only these exact character sequences are recognized. A sequence such as +- [a[:<:]b] provokes error for an unrecognized POSIX class name. This +- support is not compatible with Perl. It is provided to help migrations ++ [a[:<:]b] provokes error for an unrecognized POSIX class name. This ++ support is not compatible with Perl. It is provided to help migrations + from other environments, and is best not used in any new patterns. Note +- that \b matches at the start and the end of a word (see "Simple asser- +- tions" above), and in a Perl-style pattern the preceding or following +- character normally shows which is wanted, without the need for the +- assertions that are used above in order to give exactly the POSIX be- ++ that \b matches at the start and the end of a word (see "Simple asser- ++ tions" above), and in a Perl-style pattern the preceding or following ++ character normally shows which is wanted, without the need for the ++ assertions that are used above in order to give exactly the POSIX be- + haviour. + + + VERTICAL BAR + +- Vertical bar characters are used to separate alternative patterns. For ++ Vertical bar characters are used to separate alternative patterns. For + example, the pattern + + gilbert|sullivan + +- matches either "gilbert" or "sullivan". Any number of alternatives may +- appear, and an empty alternative is permitted (matching the empty ++ matches either "gilbert" or "sullivan". Any number of alternatives may ++ appear, and an empty alternative is permitted (matching the empty + string). The matching process tries each alternative in turn, from left +- to right, and the first one that succeeds is used. If the alternatives +- are within a subpattern (defined below), "succeeds" means matching the ++ to right, and the first one that succeeds is used. 
If the alternatives ++ are within a subpattern (defined below), "succeeds" means matching the + rest of the main pattern as well as the alternative in the subpattern. + + + INTERNAL OPTION SETTING + +- The settings of the PCRE_CASELESS, PCRE_MULTILINE, PCRE_DOTALL, and +- PCRE_EXTENDED options (which are Perl-compatible) can be changed from +- within the pattern by a sequence of Perl option letters enclosed ++ The settings of the PCRE_CASELESS, PCRE_MULTILINE, PCRE_DOTALL, and ++ PCRE_EXTENDED options (which are Perl-compatible) can be changed from ++ within the pattern by a sequence of Perl option letters enclosed + between "(?" and ")". The option letters are + + i for PCRE_CASELESS +@@ -5966,51 +5990,51 @@ INTERNAL OPTION SETTING + + For example, (?im) sets caseless, multiline matching. It is also possi- + ble to unset these options by preceding the letter with a hyphen, and a +- combined setting and unsetting such as (?im-sx), which sets PCRE_CASE- +- LESS and PCRE_MULTILINE while unsetting PCRE_DOTALL and PCRE_EXTENDED, +- is also permitted. If a letter appears both before and after the ++ combined setting and unsetting such as (?im-sx), which sets PCRE_CASE- ++ LESS and PCRE_MULTILINE while unsetting PCRE_DOTALL and PCRE_EXTENDED, ++ is also permitted. If a letter appears both before and after the + hyphen, the option is unset. + +- The PCRE-specific options PCRE_DUPNAMES, PCRE_UNGREEDY, and PCRE_EXTRA +- can be changed in the same way as the Perl-compatible options by using ++ The PCRE-specific options PCRE_DUPNAMES, PCRE_UNGREEDY, and PCRE_EXTRA ++ can be changed in the same way as the Perl-compatible options by using + the characters J, U and X respectively. 
+ +- When one of these option changes occurs at top level (that is, not +- inside subpattern parentheses), the change applies to the remainder of ++ When one of these option changes occurs at top level (that is, not ++ inside subpattern parentheses), the change applies to the remainder of + the pattern that follows. If the change is placed right at the start of + a pattern, PCRE extracts it into the global options (and it will there- + fore show up in data extracted by the pcre_fullinfo() function). + +- An option change within a subpattern (see below for a description of +- subpatterns) affects only that part of the subpattern that follows it, ++ An option change within a subpattern (see below for a description of ++ subpatterns) affects only that part of the subpattern that follows it, + so + + (a(?i)b)c + + matches abc and aBc and no other strings (assuming PCRE_CASELESS is not +- used). By this means, options can be made to have different settings +- in different parts of the pattern. Any changes made in one alternative +- do carry on into subsequent branches within the same subpattern. For ++ used). By this means, options can be made to have different settings ++ in different parts of the pattern. Any changes made in one alternative ++ do carry on into subsequent branches within the same subpattern. For + example, + + (a(?i)b|c) + +- matches "ab", "aB", "c", and "C", even though when matching "C" the +- first branch is abandoned before the option setting. This is because +- the effects of option settings happen at compile time. There would be ++ matches "ab", "aB", "c", and "C", even though when matching "C" the ++ first branch is abandoned before the option setting. This is because ++ the effects of option settings happen at compile time. There would be + some very weird behaviour otherwise. + +- Note: There are other PCRE-specific options that can be set by the +- application when the compiling or matching functions are called. 
In +- some cases the pattern can contain special leading sequences such as +- (*CRLF) to override what the application has set or what has been +- defaulted. Details are given in the section entitled "Newline +- sequences" above. There are also the (*UTF8), (*UTF16),(*UTF32), and +- (*UCP) leading sequences that can be used to set UTF and Unicode prop- +- erty modes; they are equivalent to setting the PCRE_UTF8, PCRE_UTF16, +- PCRE_UTF32 and the PCRE_UCP options, respectively. The (*UTF) sequence +- is a generic version that can be used with any of the libraries. How- +- ever, the application can set the PCRE_NEVER_UTF option, which locks ++ Note: There are other PCRE-specific options that can be set by the ++ application when the compiling or matching functions are called. In ++ some cases the pattern can contain special leading sequences such as ++ (*CRLF) to override what the application has set or what has been ++ defaulted. Details are given in the section entitled "Newline ++ sequences" above. There are also the (*UTF8), (*UTF16),(*UTF32), and ++ (*UCP) leading sequences that can be used to set UTF and Unicode prop- ++ erty modes; they are equivalent to setting the PCRE_UTF8, PCRE_UTF16, ++ PCRE_UTF32 and the PCRE_UCP options, respectively. The (*UTF) sequence ++ is a generic version that can be used with any of the libraries. How- ++ ever, the application can set the PCRE_NEVER_UTF option, which locks + out the use of the (*UTF) sequences. + + +@@ -6023,18 +6047,18 @@ SUBPATTERNS + + cat(aract|erpillar|) + +- matches "cataract", "caterpillar", or "cat". Without the parentheses, ++ matches "cataract", "caterpillar", or "cat". Without the parentheses, + it would match "cataract", "erpillar" or an empty string. + +- 2. It sets up the subpattern as a capturing subpattern. This means +- that, when the whole pattern matches, that portion of the subject ++ 2. It sets up the subpattern as a capturing subpattern. 
This means ++ that, when the whole pattern matches, that portion of the subject + string that matched the subpattern is passed back to the caller via the +- ovector argument of the matching function. (This applies only to the +- traditional matching functions; the DFA matching functions do not sup- ++ ovector argument of the matching function. (This applies only to the ++ traditional matching functions; the DFA matching functions do not sup- + port capturing.) + + Opening parentheses are counted from left to right (starting from 1) to +- obtain numbers for the capturing subpatterns. For example, if the ++ obtain numbers for the capturing subpatterns. For example, if the + string "the red king" is matched against the pattern + + the ((red|white) (king|queen)) +@@ -6042,12 +6066,12 @@ SUBPATTERNS + the captured substrings are "red king", "red", and "king", and are num- + bered 1, 2, and 3, respectively. + +- The fact that plain parentheses fulfil two functions is not always +- helpful. There are often times when a grouping subpattern is required +- without a capturing requirement. If an opening parenthesis is followed +- by a question mark and a colon, the subpattern does not do any captur- +- ing, and is not counted when computing the number of any subsequent +- capturing subpatterns. For example, if the string "the white queen" is ++ The fact that plain parentheses fulfil two functions is not always ++ helpful. There are often times when a grouping subpattern is required ++ without a capturing requirement. If an opening parenthesis is followed ++ by a question mark and a colon, the subpattern does not do any captur- ++ ing, and is not counted when computing the number of any subsequent ++ capturing subpatterns. For example, if the string "the white queen" is + matched against the pattern + + the ((?:red|white) (king|queen)) +@@ -6055,37 +6079,37 @@ SUBPATTERNS + the captured substrings are "white queen" and "queen", and are numbered + 1 and 2. 
The maximum number of capturing subpatterns is 65535. + +- As a convenient shorthand, if any option settings are required at the +- start of a non-capturing subpattern, the option letters may appear ++ As a convenient shorthand, if any option settings are required at the ++ start of a non-capturing subpattern, the option letters may appear + between the "?" and the ":". Thus the two patterns + + (?i:saturday|sunday) + (?:(?i)saturday|sunday) + + match exactly the same set of strings. Because alternative branches are +- tried from left to right, and options are not reset until the end of +- the subpattern is reached, an option setting in one branch does affect +- subsequent branches, so the above patterns match "SUNDAY" as well as ++ tried from left to right, and options are not reset until the end of ++ the subpattern is reached, an option setting in one branch does affect ++ subsequent branches, so the above patterns match "SUNDAY" as well as + "Saturday". + + + DUPLICATE SUBPATTERN NUMBERS + + Perl 5.10 introduced a feature whereby each alternative in a subpattern +- uses the same numbers for its capturing parentheses. Such a subpattern +- starts with (?| and is itself a non-capturing subpattern. For example, ++ uses the same numbers for its capturing parentheses. Such a subpattern ++ starts with (?| and is itself a non-capturing subpattern. For example, + consider this pattern: + + (?|(Sat)ur|(Sun))day + +- Because the two alternatives are inside a (?| group, both sets of cap- +- turing parentheses are numbered one. Thus, when the pattern matches, +- you can look at captured substring number one, whichever alternative +- matched. This construct is useful when you want to capture part, but ++ Because the two alternatives are inside a (?| group, both sets of cap- ++ turing parentheses are numbered one. Thus, when the pattern matches, ++ you can look at captured substring number one, whichever alternative ++ matched. 
This construct is useful when you want to capture part, but + not all, of one of a number of alternatives. Inside a (?| group, paren- +- theses are numbered as usual, but the number is reset at the start of +- each branch. The numbers of any capturing parentheses that follow the +- subpattern start after the highest number used in any branch. The fol- ++ theses are numbered as usual, but the number is reset at the start of ++ each branch. The numbers of any capturing parentheses that follow the ++ subpattern start after the highest number used in any branch. The fol- + lowing example is taken from the Perl documentation. The numbers under- + neath show in which buffer the captured content will be stored. + +@@ -6093,58 +6117,58 @@ DUPLICATE SUBPATTERN NUMBERS + / ( a ) (?| x ( y ) z | (p (q) r) | (t) u (v) ) ( z ) /x + # 1 2 2 3 2 3 4 + +- A back reference to a numbered subpattern uses the most recent value +- that is set for that number by any subpattern. The following pattern ++ A back reference to a numbered subpattern uses the most recent value ++ that is set for that number by any subpattern. The following pattern + matches "abcabc" or "defdef": + + /(?|(abc)|(def))\1/ + +- In contrast, a subroutine call to a numbered subpattern always refers +- to the first one in the pattern with the given number. The following ++ In contrast, a subroutine call to a numbered subpattern always refers ++ to the first one in the pattern with the given number. The following + pattern matches "abcabc" or "defabc": + + /(?|(abc)|(def))(?1)/ + +- If a condition test for a subpattern's having matched refers to a non- +- unique number, the test is true if any of the subpatterns of that num- ++ If a condition test for a subpattern's having matched refers to a non- ++ unique number, the test is true if any of the subpatterns of that num- + ber have matched. 
+ +- An alternative approach to using this "branch reset" feature is to use ++ An alternative approach to using this "branch reset" feature is to use + duplicate named subpatterns, as described in the next section. + + + NAMED SUBPATTERNS + +- Identifying capturing parentheses by number is simple, but it can be +- very hard to keep track of the numbers in complicated regular expres- +- sions. Furthermore, if an expression is modified, the numbers may +- change. To help with this difficulty, PCRE supports the naming of sub- ++ Identifying capturing parentheses by number is simple, but it can be ++ very hard to keep track of the numbers in complicated regular expres- ++ sions. Furthermore, if an expression is modified, the numbers may ++ change. To help with this difficulty, PCRE supports the naming of sub- + patterns. This feature was not added to Perl until release 5.10. Python +- had the feature earlier, and PCRE introduced it at release 4.0, using +- the Python syntax. PCRE now supports both the Perl and the Python syn- +- tax. Perl allows identically numbered subpatterns to have different ++ had the feature earlier, and PCRE introduced it at release 4.0, using ++ the Python syntax. PCRE now supports both the Perl and the Python syn- ++ tax. Perl allows identically numbered subpatterns to have different + names, but PCRE does not. + +- In PCRE, a subpattern can be named in one of three ways: (?<name>...) +- or (?'name'...) as in Perl, or (?P<name>...) as in Python. References +- to capturing parentheses from other parts of the pattern, such as back +- references, recursion, and conditions, can be made by name as well as ++ In PCRE, a subpattern can be named in one of three ways: (?<name>...) ++ or (?'name'...) as in Perl, or (?P<name>...) as in Python. References ++ to capturing parentheses from other parts of the pattern, such as back ++ references, recursion, and conditions, can be made by name as well as + by number. 
+ +- Names consist of up to 32 alphanumeric characters and underscores, but +- must start with a non-digit. Named capturing parentheses are still +- allocated numbers as well as names, exactly as if the names were not +- present. The PCRE API provides function calls for extracting the name- +- to-number translation table from a compiled pattern. There is also a ++ Names consist of up to 32 alphanumeric characters and underscores, but ++ must start with a non-digit. Named capturing parentheses are still ++ allocated numbers as well as names, exactly as if the names were not ++ present. The PCRE API provides function calls for extracting the name- ++ to-number translation table from a compiled pattern. There is also a + convenience function for extracting a captured substring by name. + +- By default, a name must be unique within a pattern, but it is possible ++ By default, a name must be unique within a pattern, but it is possible + to relax this constraint by setting the PCRE_DUPNAMES option at compile +- time. (Duplicate names are also always permitted for subpatterns with +- the same number, set up as described in the previous section.) Dupli- +- cate names can be useful for patterns where only one instance of the +- named parentheses can match. Suppose you want to match the name of a +- weekday, either as a 3-letter abbreviation or as the full name, and in ++ time. (Duplicate names are also always permitted for subpatterns with ++ the same number, set up as described in the previous section.) Dupli- ++ cate names can be useful for patterns where only one instance of the ++ named parentheses can match. Suppose you want to match the name of a ++ weekday, either as a 3-letter abbreviation or as the full name, and in + both cases you want to extract the abbreviation. This pattern (ignoring + the line breaks) does the job: + +@@ -6154,18 +6178,18 @@ NAMED SUBPATTERNS + (?<DN>Thu)(?:rsday)?| + (?<DN>Sat)(?:urday)? 
+ +- There are five capturing substrings, but only one is ever set after a ++ There are five capturing substrings, but only one is ever set after a + match. (An alternative way of solving this problem is to use a "branch + reset" subpattern, as described in the previous section.) + +- The convenience function for extracting the data by name returns the +- substring for the first (and in this example, the only) subpattern of +- that name that matched. This saves searching to find which numbered ++ The convenience function for extracting the data by name returns the ++ substring for the first (and in this example, the only) subpattern of ++ that name that matched. This saves searching to find which numbered + subpattern it was. + +- If you make a back reference to a non-unique named subpattern from +- elsewhere in the pattern, the subpatterns to which the name refers are +- checked in the order in which they appear in the overall pattern. The ++ If you make a back reference to a non-unique named subpattern from ++ elsewhere in the pattern, the subpatterns to which the name refers are ++ checked in the order in which they appear in the overall pattern. The + first one that is set is used for the reference. For example, this pat- + tern matches both "foofoo" and "barbar" but not "foobar" or "barfoo": + +@@ -6173,29 +6197,29 @@ NAMED SUBPATTERNS + + + If you make a subroutine call to a non-unique named subpattern, the one +- that corresponds to the first occurrence of the name is used. In the ++ that corresponds to the first occurrence of the name is used. In the + absence of duplicate numbers (see the previous section) this is the one + with the lowest number. + + If you use a named reference in a condition test (see the section about + conditions below), either to check whether a subpattern has matched, or +- to check for recursion, all subpatterns with the same name are tested. +- If the condition is true for any one of them, the overall condition is +- true. 
This is the same behaviour as testing by number. For further +- details of the interfaces for handling named subpatterns, see the ++ to check for recursion, all subpatterns with the same name are tested. ++ If the condition is true for any one of them, the overall condition is ++ true. This is the same behaviour as testing by number. For further ++ details of the interfaces for handling named subpatterns, see the + pcreapi documentation. + + Warning: You cannot use different names to distinguish between two sub- +- patterns with the same number because PCRE uses only the numbers when ++ patterns with the same number because PCRE uses only the numbers when + matching. For this reason, an error is given at compile time if differ- +- ent names are given to subpatterns with the same number. However, you ++ ent names are given to subpatterns with the same number. However, you + can always give the same name to subpatterns with the same number, even + when PCRE_DUPNAMES is not set. + + + REPETITION + +- Repetition is specified by quantifiers, which can follow any of the ++ Repetition is specified by quantifiers, which can follow any of the + following items: + + a literal data character +@@ -6209,17 +6233,17 @@ REPETITION + a parenthesized subpattern (including assertions) + a subroutine call to a subpattern (recursive or otherwise) + +- The general repetition quantifier specifies a minimum and maximum num- +- ber of permitted matches, by giving the two numbers in curly brackets +- (braces), separated by a comma. The numbers must be less than 65536, ++ The general repetition quantifier specifies a minimum and maximum num- ++ ber of permitted matches, by giving the two numbers in curly brackets ++ (braces), separated by a comma. The numbers must be less than 65536, + and the first must be less than or equal to the second. For example: + + z{2,4} + +- matches "zz", "zzz", or "zzzz". A closing brace on its own is not a +- special character. 
If the second number is omitted, but the comma is +- present, there is no upper limit; if the second number and the comma +- are both omitted, the quantifier specifies an exact number of required ++ matches "zz", "zzz", or "zzzz". A closing brace on its own is not a ++ special character. If the second number is omitted, but the comma is ++ present, there is no upper limit; if the second number and the comma ++ are both omitted, the quantifier specifies an exact number of required + matches. Thus + + [aeiou]{3,} +@@ -6228,50 +6252,50 @@ REPETITION + + \d{8} + +- matches exactly 8 digits. An opening curly bracket that appears in a +- position where a quantifier is not allowed, or one that does not match +- the syntax of a quantifier, is taken as a literal character. For exam- ++ matches exactly 8 digits. An opening curly bracket that appears in a ++ position where a quantifier is not allowed, or one that does not match ++ the syntax of a quantifier, is taken as a literal character. For exam- + ple, {,6} is not a quantifier, but a literal string of four characters. + + In UTF modes, quantifiers apply to characters rather than to individual +- data units. Thus, for example, \x{100}{2} matches two characters, each ++ data units. Thus, for example, \x{100}{2} matches two characters, each + of which is represented by a two-byte sequence in a UTF-8 string. Simi- +- larly, \X{3} matches three Unicode extended grapheme clusters, each of +- which may be several data units long (and they may be of different ++ larly, \X{3} matches three Unicode extended grapheme clusters, each of ++ which may be several data units long (and they may be of different + lengths). + + The quantifier {0} is permitted, causing the expression to behave as if + the previous item and the quantifier were not present. 
This may be use- +- ful for subpatterns that are referenced as subroutines from elsewhere ++ ful for subpatterns that are referenced as subroutines from elsewhere + in the pattern (but see also the section entitled "Defining subpatterns +- for use by reference only" below). Items other than subpatterns that ++ for use by reference only" below). Items other than subpatterns that + have a {0} quantifier are omitted from the compiled pattern. + +- For convenience, the three most common quantifiers have single-charac- ++ For convenience, the three most common quantifiers have single-charac- + ter abbreviations: + + * is equivalent to {0,} + + is equivalent to {1,} + ? is equivalent to {0,1} + +- It is possible to construct infinite loops by following a subpattern ++ It is possible to construct infinite loops by following a subpattern + that can match no characters with a quantifier that has no upper limit, + for example: + + (a?)* + + Earlier versions of Perl and PCRE used to give an error at compile time +- for such patterns. However, because there are cases where this can be +- useful, such patterns are now accepted, but if any repetition of the +- subpattern does in fact match no characters, the loop is forcibly bro- ++ for such patterns. However, because there are cases where this can be ++ useful, such patterns are now accepted, but if any repetition of the ++ subpattern does in fact match no characters, the loop is forcibly bro- + ken. + +- By default, the quantifiers are "greedy", that is, they match as much +- as possible (up to the maximum number of permitted times), without +- causing the rest of the pattern to fail. The classic example of where ++ By default, the quantifiers are "greedy", that is, they match as much ++ as possible (up to the maximum number of permitted times), without ++ causing the rest of the pattern to fail. The classic example of where + this gives problems is in trying to match comments in C programs. 
These +- appear between /* and */ and within the comment, individual * and / +- characters may appear. An attempt to match C comments by applying the ++ appear between /* and */ and within the comment, individual * and / ++ characters may appear. An attempt to match C comments by applying the + pattern + + /\*.*\*/ +@@ -6280,19 +6304,19 @@ REPETITION + + /* first comment */ not comment /* second comment */ + +- fails, because it matches the entire string owing to the greediness of ++ fails, because it matches the entire string owing to the greediness of + the .* item. + +- However, if a quantifier is followed by a question mark, it ceases to ++ However, if a quantifier is followed by a question mark, it ceases to + be greedy, and instead matches the minimum number of times possible, so + the pattern + + /\*.*?\*/ + +- does the right thing with the C comments. The meaning of the various +- quantifiers is not otherwise changed, just the preferred number of +- matches. Do not confuse this use of question mark with its use as a +- quantifier in its own right. Because it has two uses, it can sometimes ++ does the right thing with the C comments. The meaning of the various ++ quantifiers is not otherwise changed, just the preferred number of ++ matches. Do not confuse this use of question mark with its use as a ++ quantifier in its own right. Because it has two uses, it can sometimes + appear doubled, as in + + \d??\d +@@ -6300,45 +6324,45 @@ REPETITION + which matches one digit by preference, but can match two if that is the + only way the rest of the pattern matches. + +- If the PCRE_UNGREEDY option is set (an option that is not available in +- Perl), the quantifiers are not greedy by default, but individual ones +- can be made greedy by following them with a question mark. 
In other ++ If the PCRE_UNGREEDY option is set (an option that is not available in ++ Perl), the quantifiers are not greedy by default, but individual ones ++ can be made greedy by following them with a question mark. In other + words, it inverts the default behaviour. + +- When a parenthesized subpattern is quantified with a minimum repeat +- count that is greater than 1 or with a limited maximum, more memory is +- required for the compiled pattern, in proportion to the size of the ++ When a parenthesized subpattern is quantified with a minimum repeat ++ count that is greater than 1 or with a limited maximum, more memory is ++ required for the compiled pattern, in proportion to the size of the + minimum or maximum. + + If a pattern starts with .* or .{0,} and the PCRE_DOTALL option (equiv- +- alent to Perl's /s) is set, thus allowing the dot to match newlines, +- the pattern is implicitly anchored, because whatever follows will be +- tried against every character position in the subject string, so there +- is no point in retrying the overall match at any position after the +- first. PCRE normally treats such a pattern as though it were preceded ++ alent to Perl's /s) is set, thus allowing the dot to match newlines, ++ the pattern is implicitly anchored, because whatever follows will be ++ tried against every character position in the subject string, so there ++ is no point in retrying the overall match at any position after the ++ first. PCRE normally treats such a pattern as though it were preceded + by \A. + +- In cases where it is known that the subject string contains no new- +- lines, it is worth setting PCRE_DOTALL in order to obtain this opti- ++ In cases where it is known that the subject string contains no new- ++ lines, it is worth setting PCRE_DOTALL in order to obtain this opti- + mization, or alternatively using ^ to indicate anchoring explicitly. + +- However, there are some cases where the optimization cannot be used. 
++ However, there are some cases where the optimization cannot be used. + When .* is inside capturing parentheses that are the subject of a back + reference elsewhere in the pattern, a match at the start may fail where + a later one succeeds. Consider, for example: + + (.*)abc\1 + +- If the subject is "xyz123abc123" the match point is the fourth charac- ++ If the subject is "xyz123abc123" the match point is the fourth charac- + ter. For this reason, such a pattern is not implicitly anchored. + +- Another case where implicit anchoring is not applied is when the lead- +- ing .* is inside an atomic group. Once again, a match at the start may ++ Another case where implicit anchoring is not applied is when the lead- ++ ing .* is inside an atomic group. Once again, a match at the start may + fail where a later one succeeds. Consider this pattern: + + (?>.*?a)b + +- It matches "ab" in the subject "aab". The use of the backtracking con- ++ It matches "ab" in the subject "aab". The use of the backtracking con- + trol verbs (*PRUNE) and (*SKIP) also disable this optimization. + + When a capturing subpattern is repeated, the value captured is the sub- +@@ -6347,8 +6371,8 @@ REPETITION + (tweedle[dume]{3}\s*)+ + + has matched "tweedledum tweedledee" the value of the captured substring +- is "tweedledee". However, if there are nested capturing subpatterns, +- the corresponding captured values may have been set in previous itera- ++ is "tweedledee". However, if there are nested capturing subpatterns, ++ the corresponding captured values may have been set in previous itera- + tions. For example, after + + /(a|(b))+/ +@@ -6358,53 +6382,53 @@ REPETITION + + ATOMIC GROUPING AND POSSESSIVE QUANTIFIERS + +- With both maximizing ("greedy") and minimizing ("ungreedy" or "lazy") +- repetition, failure of what follows normally causes the repeated item +- to be re-evaluated to see if a different number of repeats allows the +- rest of the pattern to match. 
Sometimes it is useful to prevent this, +- either to change the nature of the match, or to cause it fail earlier +- than it otherwise might, when the author of the pattern knows there is ++ With both maximizing ("greedy") and minimizing ("ungreedy" or "lazy") ++ repetition, failure of what follows normally causes the repeated item ++ to be re-evaluated to see if a different number of repeats allows the ++ rest of the pattern to match. Sometimes it is useful to prevent this, ++ either to change the nature of the match, or to cause it fail earlier ++ than it otherwise might, when the author of the pattern knows there is + no point in carrying on. + +- Consider, for example, the pattern \d+foo when applied to the subject ++ Consider, for example, the pattern \d+foo when applied to the subject + line + + 123456bar + + After matching all 6 digits and then failing to match "foo", the normal +- action of the matcher is to try again with only 5 digits matching the +- \d+ item, and then with 4, and so on, before ultimately failing. +- "Atomic grouping" (a term taken from Jeffrey Friedl's book) provides +- the means for specifying that once a subpattern has matched, it is not ++ action of the matcher is to try again with only 5 digits matching the ++ \d+ item, and then with 4, and so on, before ultimately failing. ++ "Atomic grouping" (a term taken from Jeffrey Friedl's book) provides ++ the means for specifying that once a subpattern has matched, it is not + to be re-evaluated in this way. + +- If we use atomic grouping for the previous example, the matcher gives +- up immediately on failing to match "foo" the first time. The notation ++ If we use atomic grouping for the previous example, the matcher gives ++ up immediately on failing to match "foo" the first time. 
The notation + is a kind of special parenthesis, starting with (?> as in this example: + + (?>\d+)foo + +- This kind of parenthesis "locks up" the part of the pattern it con- +- tains once it has matched, and a failure further into the pattern is +- prevented from backtracking into it. Backtracking past it to previous ++ This kind of parenthesis "locks up" the part of the pattern it con- ++ tains once it has matched, and a failure further into the pattern is ++ prevented from backtracking into it. Backtracking past it to previous + items, however, works as normal. + +- An alternative description is that a subpattern of this type matches +- the string of characters that an identical standalone pattern would ++ An alternative description is that a subpattern of this type matches ++ the string of characters that an identical standalone pattern would + match, if anchored at the current point in the subject string. + + Atomic grouping subpatterns are not capturing subpatterns. Simple cases + such as the above example can be thought of as a maximizing repeat that +- must swallow everything it can. So, while both \d+ and \d+? are pre- +- pared to adjust the number of digits they match in order to make the ++ must swallow everything it can. So, while both \d+ and \d+? are pre- ++ pared to adjust the number of digits they match in order to make the + rest of the pattern match, (?>\d+) can only match an entire sequence of + digits. + +- Atomic groups in general can of course contain arbitrarily complicated +- subpatterns, and can be nested. However, when the subpattern for an ++ Atomic groups in general can of course contain arbitrarily complicated ++ subpatterns, and can be nested. However, when the subpattern for an + atomic group is just a single repeated item, as in the example above, a +- simpler notation, called a "possessive quantifier" can be used. This +- consists of an additional + character following a quantifier. 
Using ++ simpler notation, called a "possessive quantifier" can be used. This ++ consists of an additional + character following a quantifier. Using + this notation, the previous example can be rewritten as + + \d++foo +@@ -6414,45 +6438,45 @@ ATOMIC GROUPING AND POSSESSIVE QUANTIFIERS + + (abc|xyz){2,3}+ + +- Possessive quantifiers are always greedy; the setting of the ++ Possessive quantifiers are always greedy; the setting of the + PCRE_UNGREEDY option is ignored. They are a convenient notation for the +- simpler forms of atomic group. However, there is no difference in the +- meaning of a possessive quantifier and the equivalent atomic group, +- though there may be a performance difference; possessive quantifiers ++ simpler forms of atomic group. However, there is no difference in the ++ meaning of a possessive quantifier and the equivalent atomic group, ++ though there may be a performance difference; possessive quantifiers + should be slightly faster. + +- The possessive quantifier syntax is an extension to the Perl 5.8 syn- +- tax. Jeffrey Friedl originated the idea (and the name) in the first ++ The possessive quantifier syntax is an extension to the Perl 5.8 syn- ++ tax. Jeffrey Friedl originated the idea (and the name) in the first + edition of his book. Mike McCloskey liked it, so implemented it when he +- built Sun's Java package, and PCRE copied it from there. It ultimately ++ built Sun's Java package, and PCRE copied it from there. It ultimately + found its way into Perl at release 5.10. + + PCRE has an optimization that automatically "possessifies" certain sim- +- ple pattern constructs. For example, the sequence A+B is treated as +- A++B because there is no point in backtracking into a sequence of A's ++ ple pattern constructs. For example, the sequence A+B is treated as ++ A++B because there is no point in backtracking into a sequence of A's + when B must follow. 
+ +- When a pattern contains an unlimited repeat inside a subpattern that +- can itself be repeated an unlimited number of times, the use of an +- atomic group is the only way to avoid some failing matches taking a ++ When a pattern contains an unlimited repeat inside a subpattern that ++ can itself be repeated an unlimited number of times, the use of an ++ atomic group is the only way to avoid some failing matches taking a + very long time indeed. The pattern + + (\D+|<\d+>)*[!?] + +- matches an unlimited number of substrings that either consist of non- +- digits, or digits enclosed in <>, followed by either ! or ?. When it ++ matches an unlimited number of substrings that either consist of non- ++ digits, or digits enclosed in <>, followed by either ! or ?. When it + matches, it runs quickly. However, if it is applied to + + aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa + +- it takes a long time before reporting failure. This is because the +- string can be divided between the internal \D+ repeat and the external +- * repeat in a large number of ways, and all have to be tried. (The +- example uses [!?] rather than a single character at the end, because +- both PCRE and Perl have an optimization that allows for fast failure +- when a single character is used. They remember the last single charac- +- ter that is required for a match, and fail early if it is not present +- in the string.) If the pattern is changed so that it uses an atomic ++ it takes a long time before reporting failure. This is because the ++ string can be divided between the internal \D+ repeat and the external ++ * repeat in a large number of ways, and all have to be tried. (The ++ example uses [!?] rather than a single character at the end, because ++ both PCRE and Perl have an optimization that allows for fast failure ++ when a single character is used. They remember the last single charac- ++ ter that is required for a match, and fail early if it is not present ++ in the string.) 
If the pattern is changed so that it uses an atomic + group, like this: + + ((?>\D+)|<\d+>)*[!?] +@@ -6464,28 +6488,28 @@ BACK REFERENCES + + Outside a character class, a backslash followed by a digit greater than + 0 (and possibly further digits) is a back reference to a capturing sub- +- pattern earlier (that is, to its left) in the pattern, provided there ++ pattern earlier (that is, to its left) in the pattern, provided there + have been that many previous capturing left parentheses. + + However, if the decimal number following the backslash is less than 10, +- it is always taken as a back reference, and causes an error only if +- there are not that many capturing left parentheses in the entire pat- +- tern. In other words, the parentheses that are referenced need not be +- to the left of the reference for numbers less than 10. A "forward back +- reference" of this type can make sense when a repetition is involved +- and the subpattern to the right has participated in an earlier itera- ++ it is always taken as a back reference, and causes an error only if ++ there are not that many capturing left parentheses in the entire pat- ++ tern. In other words, the parentheses that are referenced need not be ++ to the left of the reference for numbers less than 10. A "forward back ++ reference" of this type can make sense when a repetition is involved ++ and the subpattern to the right has participated in an earlier itera- + tion. + +- It is not possible to have a numerical "forward back reference" to a +- subpattern whose number is 10 or more using this syntax because a +- sequence such as \50 is interpreted as a character defined in octal. ++ It is not possible to have a numerical "forward back reference" to a ++ subpattern whose number is 10 or more using this syntax because a ++ sequence such as \50 is interpreted as a character defined in octal. 
+ See the subsection entitled "Non-printing characters" above for further +- details of the handling of digits following a backslash. There is no +- such problem when named parentheses are used. A back reference to any ++ details of the handling of digits following a backslash. There is no ++ such problem when named parentheses are used. A back reference to any + subpattern is possible using named parentheses (see below). + +- Another way of avoiding the ambiguity inherent in the use of digits +- following a backslash is to use the \g escape sequence. This escape ++ Another way of avoiding the ambiguity inherent in the use of digits ++ following a backslash is to use the \g escape sequence. This escape + must be followed by an unsigned number or a negative number, optionally + enclosed in braces. These examples are all identical: + +@@ -6493,7 +6517,7 @@ BACK REFERENCES + (ring), \g1 + (ring), \g{1} + +- An unsigned number specifies an absolute reference without the ambigu- ++ An unsigned number specifies an absolute reference without the ambigu- + ity that is present in the older syntax. It is also useful when literal + digits follow the reference. A negative number is a relative reference. + Consider this example: +@@ -6502,33 +6526,33 @@ BACK REFERENCES + + The sequence \g{-1} is a reference to the most recently started captur- + ing subpattern before \g, that is, is it equivalent to \2 in this exam- +- ple. Similarly, \g{-2} would be equivalent to \1. The use of relative +- references can be helpful in long patterns, and also in patterns that +- are created by joining together fragments that contain references ++ ple. Similarly, \g{-2} would be equivalent to \1. The use of relative ++ references can be helpful in long patterns, and also in patterns that ++ are created by joining together fragments that contain references + within themselves. 
+ +- A back reference matches whatever actually matched the capturing sub- +- pattern in the current subject string, rather than anything matching ++ A back reference matches whatever actually matched the capturing sub- ++ pattern in the current subject string, rather than anything matching + the subpattern itself (see "Subpatterns as subroutines" below for a way + of doing that). So the pattern + + (sens|respons)e and \1ibility + +- matches "sense and sensibility" and "response and responsibility", but +- not "sense and responsibility". If caseful matching is in force at the +- time of the back reference, the case of letters is relevant. For exam- ++ matches "sense and sensibility" and "response and responsibility", but ++ not "sense and responsibility". If caseful matching is in force at the ++ time of the back reference, the case of letters is relevant. For exam- + ple, + + ((?i)rah)\s+\1 + +- matches "rah rah" and "RAH RAH", but not "RAH rah", even though the ++ matches "rah rah" and "RAH RAH", but not "RAH rah", even though the + original capturing subpattern is matched caselessly. + +- There are several different ways of writing back references to named +- subpatterns. The .NET syntax \k{name} and the Perl syntax \k<name> or +- \k'name' are supported, as is the Python syntax (?P=name). Perl 5.10's ++ There are several different ways of writing back references to named ++ subpatterns. The .NET syntax \k{name} and the Perl syntax \k<name> or ++ \k'name' are supported, as is the Python syntax (?P=name). Perl 5.10's + unified back reference syntax, in which \g can be used for both numeric +- and named references, is also supported. We could rewrite the above ++ and named references, is also supported. 
We could rewrite the above + example in any of the following ways: + + (?<p1>(?i)rah)\s+\k<p1> +@@ -6536,84 +6560,84 @@ BACK REFERENCES + (?P<p1>(?i)rah)\s+(?P=p1) + (?<p1>(?i)rah)\s+\g{p1} + +- A subpattern that is referenced by name may appear in the pattern ++ A subpattern that is referenced by name may appear in the pattern + before or after the reference. + +- There may be more than one back reference to the same subpattern. If a +- subpattern has not actually been used in a particular match, any back ++ There may be more than one back reference to the same subpattern. If a ++ subpattern has not actually been used in a particular match, any back + references to it always fail by default. For example, the pattern + + (a|(bc))\2 + +- always fails if it starts to match "a" rather than "bc". However, if ++ always fails if it starts to match "a" rather than "bc". However, if + the PCRE_JAVASCRIPT_COMPAT option is set at compile time, a back refer- + ence to an unset value matches an empty string. + +- Because there may be many capturing parentheses in a pattern, all dig- +- its following a backslash are taken as part of a potential back refer- +- ence number. If the pattern continues with a digit character, some +- delimiter must be used to terminate the back reference. If the +- PCRE_EXTENDED option is set, this can be white space. Otherwise, the ++ Because there may be many capturing parentheses in a pattern, all dig- ++ its following a backslash are taken as part of a potential back refer- ++ ence number. If the pattern continues with a digit character, some ++ delimiter must be used to terminate the back reference. If the ++ PCRE_EXTENDED option is set, this can be white space. Otherwise, the + \g{ syntax or an empty comment (see "Comments" below) can be used. + + Recursive back references + +- A back reference that occurs inside the parentheses to which it refers +- fails when the subpattern is first used, so, for example, (a\1) never +- matches. 
However, such references can be useful inside repeated sub- ++ A back reference that occurs inside the parentheses to which it refers ++ fails when the subpattern is first used, so, for example, (a\1) never ++ matches. However, such references can be useful inside repeated sub- + patterns. For example, the pattern + + (a|b\1)+ + + matches any number of "a"s and also "aba", "ababbaa" etc. At each iter- +- ation of the subpattern, the back reference matches the character +- string corresponding to the previous iteration. In order for this to +- work, the pattern must be such that the first iteration does not need +- to match the back reference. This can be done using alternation, as in ++ ation of the subpattern, the back reference matches the character ++ string corresponding to the previous iteration. In order for this to ++ work, the pattern must be such that the first iteration does not need ++ to match the back reference. This can be done using alternation, as in + the example above, or by a quantifier with a minimum of zero. + +- Back references of this type cause the group that they reference to be +- treated as an atomic group. Once the whole group has been matched, a +- subsequent matching failure cannot cause backtracking into the middle ++ Back references of this type cause the group that they reference to be ++ treated as an atomic group. Once the whole group has been matched, a ++ subsequent matching failure cannot cause backtracking into the middle + of the group. + + + ASSERTIONS + +- An assertion is a test on the characters following or preceding the +- current matching point that does not actually consume any characters. +- The simple assertions coded as \b, \B, \A, \G, \Z, \z, ^ and $ are ++ An assertion is a test on the characters following or preceding the ++ current matching point that does not actually consume any characters. ++ The simple assertions coded as \b, \B, \A, \G, \Z, \z, ^ and $ are + described above. 
+ +- More complicated assertions are coded as subpatterns. There are two +- kinds: those that look ahead of the current position in the subject +- string, and those that look behind it. An assertion subpattern is +- matched in the normal way, except that it does not cause the current ++ More complicated assertions are coded as subpatterns. There are two ++ kinds: those that look ahead of the current position in the subject ++ string, and those that look behind it. An assertion subpattern is ++ matched in the normal way, except that it does not cause the current + matching position to be changed. + +- Assertion subpatterns are not capturing subpatterns. If such an asser- +- tion contains capturing subpatterns within it, these are counted for +- the purposes of numbering the capturing subpatterns in the whole pat- +- tern. However, substring capturing is carried out only for positive ++ Assertion subpatterns are not capturing subpatterns. If such an asser- ++ tion contains capturing subpatterns within it, these are counted for ++ the purposes of numbering the capturing subpatterns in the whole pat- ++ tern. However, substring capturing is carried out only for positive + assertions. (Perl sometimes, but not always, does do capturing in nega- + tive assertions.) + +- For compatibility with Perl, assertion subpatterns may be repeated; +- though it makes no sense to assert the same thing several times, the +- side effect of capturing parentheses may occasionally be useful. In ++ For compatibility with Perl, assertion subpatterns may be repeated; ++ though it makes no sense to assert the same thing several times, the ++ side effect of capturing parentheses may occasionally be useful. In + practice, there only three cases: + +- (1) If the quantifier is {0}, the assertion is never obeyed during +- matching. However, it may contain internal capturing parenthesized ++ (1) If the quantifier is {0}, the assertion is never obeyed during ++ matching. 
However, it may contain internal capturing parenthesized + groups that are called from elsewhere via the subroutine mechanism. + +- (2) If quantifier is {0,n} where n is greater than zero, it is treated +- as if it were {0,1}. At run time, the rest of the pattern match is ++ (2) If quantifier is {0,n} where n is greater than zero, it is treated ++ as if it were {0,1}. At run time, the rest of the pattern match is + tried with and without the assertion, the order depending on the greed- + iness of the quantifier. + +- (3) If the minimum repetition is greater than zero, the quantifier is +- ignored. The assertion is obeyed just once when encountered during ++ (3) If the minimum repetition is greater than zero, the quantifier is ++ ignored. The assertion is obeyed just once when encountered during + matching. + + Lookahead assertions +@@ -6623,38 +6647,38 @@ ASSERTIONS + + \w+(?=;) + +- matches a word followed by a semicolon, but does not include the semi- ++ matches a word followed by a semicolon, but does not include the semi- + colon in the match, and + + foo(?!bar) + +- matches any occurrence of "foo" that is not followed by "bar". Note ++ matches any occurrence of "foo" that is not followed by "bar". Note + that the apparently similar pattern + + (?!foo)bar + +- does not find an occurrence of "bar" that is preceded by something +- other than "foo"; it finds any occurrence of "bar" whatsoever, because ++ does not find an occurrence of "bar" that is preceded by something ++ other than "foo"; it finds any occurrence of "bar" whatsoever, because + the assertion (?!foo) is always true when the next three characters are + "bar". A lookbehind assertion is needed to achieve the other effect. + + If you want to force a matching failure at some point in a pattern, the +- most convenient way to do it is with (?!) because an empty string +- always matches, so an assertion that requires there not to be an empty ++ most convenient way to do it is with (?!) 
because an empty string ++ always matches, so an assertion that requires there not to be an empty + string must always fail. The backtracking control verb (*FAIL) or (*F) + is a synonym for (?!). + + Lookbehind assertions + +- Lookbehind assertions start with (?<= for positive assertions and (?)...) or (?('name')...) to test for a +- used subpattern by name. For compatibility with earlier versions of +- PCRE, which had this facility before Perl, the syntax (?(name)...) is ++ Perl uses the syntax (?()...) or (?('name')...) to test for a ++ used subpattern by name. For compatibility with earlier versions of ++ PCRE, which had this facility before Perl, the syntax (?(name)...) is + also recognized. + + Rewriting the above example to use a named subpattern gives this: + + (? \( )? [^()]+ (?() \) ) + +- If the name used in a condition of this kind is a duplicate, the test +- is applied to all subpatterns of the same name, and is true if any one ++ If the name used in a condition of this kind is a duplicate, the test ++ is applied to all subpatterns of the same name, and is true if any one + of them has matched. + + Checking for pattern recursion + + If the condition is the string (R), and there is no subpattern with the +- name R, the condition is true if a recursive call to the whole pattern ++ name R, the condition is true if a recursive call to the whole pattern + or any subpattern has been made. If digits or a name preceded by amper- + sand follow the letter R, for example: + +@@ -6846,51 +6870,51 @@ CONDITIONAL SUBPATTERNS + + the condition is true if the most recent recursion is into a subpattern + whose number or name is given. This condition does not check the entire +- recursion stack. If the name used in a condition of this kind is a ++ recursion stack. If the name used in a condition of this kind is a + duplicate, the test is applied to all subpatterns of the same name, and + is true if any one of them is the most recent recursion. 
+ +- At "top level", all these recursion test conditions are false. The ++ At "top level", all these recursion test conditions are false. The + syntax for recursive patterns is described below. + + Defining subpatterns for use by reference only + +- If the condition is the string (DEFINE), and there is no subpattern +- with the name DEFINE, the condition is always false. In this case, +- there may be only one alternative in the subpattern. It is always +- skipped if control reaches this point in the pattern; the idea of +- DEFINE is that it can be used to define subroutines that can be refer- +- enced from elsewhere. (The use of subroutines is described below.) For +- example, a pattern to match an IPv4 address such as "192.168.23.245" ++ If the condition is the string (DEFINE), and there is no subpattern ++ with the name DEFINE, the condition is always false. In this case, ++ there may be only one alternative in the subpattern. It is always ++ skipped if control reaches this point in the pattern; the idea of ++ DEFINE is that it can be used to define subroutines that can be refer- ++ enced from elsewhere. (The use of subroutines is described below.) For ++ example, a pattern to match an IPv4 address such as "192.168.23.245" + could be written like this (ignore white space and line breaks): + + (?(DEFINE) (? 2[0-4]\d | 25[0-5] | 1\d\d | [1-9]?\d) ) + \b (?&byte) (\.(?&byte)){3} \b + +- The first part of the pattern is a DEFINE group inside which a another +- group named "byte" is defined. This matches an individual component of +- an IPv4 address (a number less than 256). When matching takes place, +- this part of the pattern is skipped because DEFINE acts like a false +- condition. The rest of the pattern uses references to the named group +- to match the four dot-separated components of an IPv4 address, insist- ++ The first part of the pattern is a DEFINE group inside which a another ++ group named "byte" is defined. 
This matches an individual component of ++ an IPv4 address (a number less than 256). When matching takes place, ++ this part of the pattern is skipped because DEFINE acts like a false ++ condition. The rest of the pattern uses references to the named group ++ to match the four dot-separated components of an IPv4 address, insist- + ing on a word boundary at each end. + + Assertion conditions + +- If the condition is not in any of the above formats, it must be an +- assertion. This may be a positive or negative lookahead or lookbehind +- assertion. Consider this pattern, again containing non-significant ++ If the condition is not in any of the above formats, it must be an ++ assertion. This may be a positive or negative lookahead or lookbehind ++ assertion. Consider this pattern, again containing non-significant + white space, and with the two alternatives on the second line: + + (?(?=[^a-z]*[a-z]) + \d{2}-[a-z]{3}-\d{2} | \d{2}-\d{2}-\d{2} ) + +- The condition is a positive lookahead assertion that matches an +- optional sequence of non-letters followed by a letter. In other words, +- it tests for the presence of at least one letter in the subject. If a +- letter is found, the subject is matched against the first alternative; +- otherwise it is matched against the second. This pattern matches +- strings in one of the two forms dd-aaa-dd or dd-dd-dd, where aaa are ++ The condition is a positive lookahead assertion that matches an ++ optional sequence of non-letters followed by a letter. In other words, ++ it tests for the presence of at least one letter in the subject. If a ++ letter is found, the subject is matched against the first alternative; ++ otherwise it is matched against the second. This pattern matches ++ strings in one of the two forms dd-aaa-dd or dd-dd-dd, where aaa are + letters and dd are digits. + + +@@ -6899,41 +6923,41 @@ COMMENTS + There are two ways of including comments in patterns that are processed + by PCRE. 
In both cases, the start of the comment must not be in a char- + acter class, nor in the middle of any other sequence of related charac- +- ters such as (?: or a subpattern name or number. The characters that ++ ters such as (?: or a subpattern name or number. The characters that + make up a comment play no part in the pattern matching. + +- The sequence (?# marks the start of a comment that continues up to the +- next closing parenthesis. Nested parentheses are not permitted. If the ++ The sequence (?# marks the start of a comment that continues up to the ++ next closing parenthesis. Nested parentheses are not permitted. If the + PCRE_EXTENDED option is set, an unescaped # character also introduces a +- comment, which in this case continues to immediately after the next +- newline character or character sequence in the pattern. Which charac- ++ comment, which in this case continues to immediately after the next ++ newline character or character sequence in the pattern. Which charac- + ters are interpreted as newlines is controlled by the options passed to +- a compiling function or by a special sequence at the start of the pat- ++ a compiling function or by a special sequence at the start of the pat- + tern, as described in the section entitled "Newline conventions" above. + Note that the end of this type of comment is a literal newline sequence +- in the pattern; escape sequences that happen to represent a newline do +- not count. For example, consider this pattern when PCRE_EXTENDED is ++ in the pattern; escape sequences that happen to represent a newline do ++ not count. For example, consider this pattern when PCRE_EXTENDED is + set, and the default newline convention is in force: + + abc #comment \n still comment + +- On encountering the # character, pcre_compile() skips along, looking +- for a newline in the pattern. The sequence \n is still literal at this +- stage, so it does not terminate the comment. 
Only an actual character ++ On encountering the # character, pcre_compile() skips along, looking ++ for a newline in the pattern. The sequence \n is still literal at this ++ stage, so it does not terminate the comment. Only an actual character + with the code value 0x0a (the default newline) does so. + + + RECURSIVE PATTERNS + +- Consider the problem of matching a string in parentheses, allowing for +- unlimited nested parentheses. Without the use of recursion, the best +- that can be done is to use a pattern that matches up to some fixed +- depth of nesting. It is not possible to handle an arbitrary nesting ++ Consider the problem of matching a string in parentheses, allowing for ++ unlimited nested parentheses. Without the use of recursion, the best ++ that can be done is to use a pattern that matches up to some fixed ++ depth of nesting. It is not possible to handle an arbitrary nesting + depth. + + For some time, Perl has provided a facility that allows regular expres- +- sions to recurse (amongst other things). It does this by interpolating +- Perl code in the expression at run time, and the code can refer to the ++ sions to recurse (amongst other things). It does this by interpolating ++ Perl code in the expression at run time, and the code can refer to the + expression itself. A Perl pattern using code interpolation to solve the + parentheses problem can be created like this: + +@@ -6943,201 +6967,201 @@ RECURSIVE PATTERNS + refers recursively to the pattern in which it appears. + + Obviously, PCRE cannot support the interpolation of Perl code. Instead, +- it supports special syntax for recursion of the entire pattern, and +- also for individual subpattern recursion. After its introduction in +- PCRE and Python, this kind of recursion was subsequently introduced ++ it supports special syntax for recursion of the entire pattern, and ++ also for individual subpattern recursion. 
After its introduction in ++ PCRE and Python, this kind of recursion was subsequently introduced + into Perl at release 5.10. + +- A special item that consists of (? followed by a number greater than +- zero and a closing parenthesis is a recursive subroutine call of the +- subpattern of the given number, provided that it occurs inside that +- subpattern. (If not, it is a non-recursive subroutine call, which is +- described in the next section.) The special item (?R) or (?0) is a ++ A special item that consists of (? followed by a number greater than ++ zero and a closing parenthesis is a recursive subroutine call of the ++ subpattern of the given number, provided that it occurs inside that ++ subpattern. (If not, it is a non-recursive subroutine call, which is ++ described in the next section.) The special item (?R) or (?0) is a + recursive call of the entire regular expression. + +- This PCRE pattern solves the nested parentheses problem (assume the ++ This PCRE pattern solves the nested parentheses problem (assume the + PCRE_EXTENDED option is set so that white space is ignored): + + \( ( [^()]++ | (?R) )* \) + +- First it matches an opening parenthesis. Then it matches any number of +- substrings which can either be a sequence of non-parentheses, or a +- recursive match of the pattern itself (that is, a correctly parenthe- ++ First it matches an opening parenthesis. Then it matches any number of ++ substrings which can either be a sequence of non-parentheses, or a ++ recursive match of the pattern itself (that is, a correctly parenthe- + sized substring). Finally there is a closing parenthesis. Note the use + of a possessive quantifier to avoid backtracking into sequences of non- + parentheses. 
+ +- If this were part of a larger pattern, you would not want to recurse ++ If this were part of a larger pattern, you would not want to recurse + the entire pattern, so instead you could use this: + + ( \( ( [^()]++ | (?1) )* \) ) + +- We have put the pattern into parentheses, and caused the recursion to ++ We have put the pattern into parentheses, and caused the recursion to + refer to them instead of the whole pattern. + +- In a larger pattern, keeping track of parenthesis numbers can be +- tricky. This is made easier by the use of relative references. Instead ++ In a larger pattern, keeping track of parenthesis numbers can be ++ tricky. This is made easier by the use of relative references. Instead + of (?1) in the pattern above you can write (?-2) to refer to the second +- most recently opened parentheses preceding the recursion. In other +- words, a negative number counts capturing parentheses leftwards from ++ most recently opened parentheses preceding the recursion. In other ++ words, a negative number counts capturing parentheses leftwards from + the point at which it is encountered. + +- It is also possible to refer to subsequently opened parentheses, by +- writing references such as (?+2). However, these cannot be recursive +- because the reference is not inside the parentheses that are refer- +- enced. They are always non-recursive subroutine calls, as described in ++ It is also possible to refer to subsequently opened parentheses, by ++ writing references such as (?+2). However, these cannot be recursive ++ because the reference is not inside the parentheses that are refer- ++ enced. They are always non-recursive subroutine calls, as described in + the next section. + +- An alternative approach is to use named parentheses instead. The Perl +- syntax for this is (?&name); PCRE's earlier syntax (?P>name) is also ++ An alternative approach is to use named parentheses instead. 
The Perl ++ syntax for this is (?&name); PCRE's earlier syntax (?P>name) is also + supported. We could rewrite the above example as follows: + + (? \( ( [^()]++ | (?&pn) )* \) ) + +- If there is more than one subpattern with the same name, the earliest ++ If there is more than one subpattern with the same name, the earliest + one is used. + +- This particular example pattern that we have been looking at contains ++ This particular example pattern that we have been looking at contains + nested unlimited repeats, and so the use of a possessive quantifier for + matching strings of non-parentheses is important when applying the pat- +- tern to strings that do not match. For example, when this pattern is ++ tern to strings that do not match. For example, when this pattern is + applied to + + (aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa() + +- it yields "no match" quickly. However, if a possessive quantifier is +- not used, the match runs for a very long time indeed because there are +- so many different ways the + and * repeats can carve up the subject, ++ it yields "no match" quickly. However, if a possessive quantifier is ++ not used, the match runs for a very long time indeed because there are ++ so many different ways the + and * repeats can carve up the subject, + and all have to be tested before failure can be reported. + +- At the end of a match, the values of capturing parentheses are those +- from the outermost level. If you want to obtain intermediate values, a +- callout function can be used (see below and the pcrecallout documenta- ++ At the end of a match, the values of capturing parentheses are those ++ from the outermost level. If you want to obtain intermediate values, a ++ callout function can be used (see below and the pcrecallout documenta- + tion). If the pattern above is matched against + + (ab(cd)ef) + +- the value for the inner capturing parentheses (numbered 2) is "ef", +- which is the last value taken on at the top level. 
If a capturing sub- +- pattern is not matched at the top level, its final captured value is +- unset, even if it was (temporarily) set at a deeper level during the ++ the value for the inner capturing parentheses (numbered 2) is "ef", ++ which is the last value taken on at the top level. If a capturing sub- ++ pattern is not matched at the top level, its final captured value is ++ unset, even if it was (temporarily) set at a deeper level during the + matching process. + +- If there are more than 15 capturing parentheses in a pattern, PCRE has +- to obtain extra memory to store data during a recursion, which it does ++ If there are more than 15 capturing parentheses in a pattern, PCRE has ++ to obtain extra memory to store data during a recursion, which it does + by using pcre_malloc, freeing it via pcre_free afterwards. If no memory + can be obtained, the match fails with the PCRE_ERROR_NOMEMORY error. + +- Do not confuse the (?R) item with the condition (R), which tests for +- recursion. Consider this pattern, which matches text in angle brack- +- ets, allowing for arbitrary nesting. Only digits are allowed in nested +- brackets (that is, when recursing), whereas any characters are permit- ++ Do not confuse the (?R) item with the condition (R), which tests for ++ recursion. Consider this pattern, which matches text in angle brack- ++ ets, allowing for arbitrary nesting. Only digits are allowed in nested ++ brackets (that is, when recursing), whereas any characters are permit- + ted at the outer level. + + < (?: (?(R) \d++ | [^<>]*+) | (?R)) * > + +- In this pattern, (?(R) is the start of a conditional subpattern, with +- two different alternatives for the recursive and non-recursive cases. ++ In this pattern, (?(R) is the start of a conditional subpattern, with ++ two different alternatives for the recursive and non-recursive cases. + The (?R) item is the actual recursive call. 
+ + Differences in recursion processing between PCRE and Perl + +- Recursion processing in PCRE differs from Perl in two important ways. +- In PCRE (like Python, but unlike Perl), a recursive subpattern call is ++ Recursion processing in PCRE differs from Perl in two important ways. ++ In PCRE (like Python, but unlike Perl), a recursive subpattern call is + always treated as an atomic group. That is, once it has matched some of + the subject string, it is never re-entered, even if it contains untried +- alternatives and there is a subsequent matching failure. This can be +- illustrated by the following pattern, which purports to match a palin- +- dromic string that contains an odd number of characters (for example, ++ alternatives and there is a subsequent matching failure. This can be ++ illustrated by the following pattern, which purports to match a palin- ++ dromic string that contains an odd number of characters (for example, + "a", "aba", "abcba", "abcdcba"): + + ^(.|(.)(?1)\2)$ + + The idea is that it either matches a single character, or two identical +- characters surrounding a sub-palindrome. In Perl, this pattern works; +- in PCRE it does not if the pattern is longer than three characters. ++ characters surrounding a sub-palindrome. In Perl, this pattern works; ++ in PCRE it does not if the pattern is longer than three characters. + Consider the subject string "abcba": + +- At the top level, the first character is matched, but as it is not at ++ At the top level, the first character is matched, but as it is not at + the end of the string, the first alternative fails; the second alterna- + tive is taken and the recursion kicks in. The recursive call to subpat- +- tern 1 successfully matches the next character ("b"). (Note that the ++ tern 1 successfully matches the next character ("b"). (Note that the + beginning and end of line tests are not part of the recursion). 
+ +- Back at the top level, the next character ("c") is compared with what +- subpattern 2 matched, which was "a". This fails. Because the recursion +- is treated as an atomic group, there are now no backtracking points, +- and so the entire match fails. (Perl is able, at this point, to re- +- enter the recursion and try the second alternative.) However, if the ++ Back at the top level, the next character ("c") is compared with what ++ subpattern 2 matched, which was "a". This fails. Because the recursion ++ is treated as an atomic group, there are now no backtracking points, ++ and so the entire match fails. (Perl is able, at this point, to re- ++ enter the recursion and try the second alternative.) However, if the + pattern is written with the alternatives in the other order, things are + different: + + ^((.)(?1)\2|.)$ + +- This time, the recursing alternative is tried first, and continues to +- recurse until it runs out of characters, at which point the recursion +- fails. But this time we do have another alternative to try at the +- higher level. That is the big difference: in the previous case the ++ This time, the recursing alternative is tried first, and continues to ++ recurse until it runs out of characters, at which point the recursion ++ fails. But this time we do have another alternative to try at the ++ higher level. That is the big difference: in the previous case the + remaining alternative is at a deeper recursion level, which PCRE cannot + use. + +- To change the pattern so that it matches all palindromic strings, not +- just those with an odd number of characters, it is tempting to change ++ To change the pattern so that it matches all palindromic strings, not ++ just those with an odd number of characters, it is tempting to change + the pattern to this: + + ^((.)(?1)\2|.?)$ + +- Again, this works in Perl, but not in PCRE, and for the same reason. 
+- When a deeper recursion has matched a single character, it cannot be +- entered again in order to match an empty string. The solution is to +- separate the two cases, and write out the odd and even cases as alter- ++ Again, this works in Perl, but not in PCRE, and for the same reason. ++ When a deeper recursion has matched a single character, it cannot be ++ entered again in order to match an empty string. The solution is to ++ separate the two cases, and write out the odd and even cases as alter- + natives at the higher level: + + ^(?:((.)(?1)\2|)|((.)(?3)\4|.)) + +- If you want to match typical palindromic phrases, the pattern has to ++ If you want to match typical palindromic phrases, the pattern has to + ignore all non-word characters, which can be done like this: + + ^\W*+(?:((.)\W*+(?1)\W*+\2|)|((.)\W*+(?3)\W*+\4|\W*+.\W*+))\W*+$ + + If run with the PCRE_CASELESS option, this pattern matches phrases such + as "A man, a plan, a canal: Panama!" and it works well in both PCRE and +- Perl. Note the use of the possessive quantifier *+ to avoid backtrack- +- ing into sequences of non-word characters. Without this, PCRE takes a +- great deal longer (ten times or more) to match typical phrases, and ++ Perl. Note the use of the possessive quantifier *+ to avoid backtrack- ++ ing into sequences of non-word characters. Without this, PCRE takes a ++ great deal longer (ten times or more) to match typical phrases, and + Perl takes so long that you think it has gone into a loop. + +- WARNING: The palindrome-matching patterns above work only if the sub- +- ject string does not start with a palindrome that is shorter than the +- entire string. For example, although "abcba" is correctly matched, if +- the subject is "ababa", PCRE finds the palindrome "aba" at the start, +- then fails at top level because the end of the string does not follow. 
+- Once again, it cannot jump back into the recursion to try other alter- ++ WARNING: The palindrome-matching patterns above work only if the sub- ++ ject string does not start with a palindrome that is shorter than the ++ entire string. For example, although "abcba" is correctly matched, if ++ the subject is "ababa", PCRE finds the palindrome "aba" at the start, ++ then fails at top level because the end of the string does not follow. ++ Once again, it cannot jump back into the recursion to try other alter- + natives, so the entire match fails. + +- The second way in which PCRE and Perl differ in their recursion pro- +- cessing is in the handling of captured values. In Perl, when a subpat- +- tern is called recursively or as a subpattern (see the next section), +- it has no access to any values that were captured outside the recur- +- sion, whereas in PCRE these values can be referenced. Consider this ++ The second way in which PCRE and Perl differ in their recursion pro- ++ cessing is in the handling of captured values. In Perl, when a subpat- ++ tern is called recursively or as a subpattern (see the next section), ++ it has no access to any values that were captured outside the recur- ++ sion, whereas in PCRE these values can be referenced. Consider this + pattern: + + ^(.)(\1|a(?2)) + +- In PCRE, this pattern matches "bab". The first capturing parentheses +- match "b", then in the second group, when the back reference \1 fails +- to match "b", the second alternative matches "a" and then recurses. In +- the recursion, \1 does now match "b" and so the whole match succeeds. +- In Perl, the pattern fails to match because inside the recursive call ++ In PCRE, this pattern matches "bab". The first capturing parentheses ++ match "b", then in the second group, when the back reference \1 fails ++ to match "b", the second alternative matches "a" and then recurses. In ++ the recursion, \1 does now match "b" and so the whole match succeeds. 
++ In Perl, the pattern fails to match because inside the recursive call + \1 cannot access the externally set value. + + + SUBPATTERNS AS SUBROUTINES + +- If the syntax for a recursive subpattern call (either by number or by +- name) is used outside the parentheses to which it refers, it operates +- like a subroutine in a programming language. The called subpattern may +- be defined before or after the reference. A numbered reference can be ++ If the syntax for a recursive subpattern call (either by number or by ++ name) is used outside the parentheses to which it refers, it operates ++ like a subroutine in a programming language. The called subpattern may ++ be defined before or after the reference. A numbered reference can be + absolute or relative, as in these examples: + + (...(absolute)...)...(?2)... +@@ -7148,79 +7172,79 @@ SUBPATTERNS AS SUBROUTINES + + (sens|respons)e and \1ibility + +- matches "sense and sensibility" and "response and responsibility", but ++ matches "sense and sensibility" and "response and responsibility", but + not "sense and responsibility". If instead the pattern + + (sens|respons)e and (?1)ibility + +- is used, it does match "sense and responsibility" as well as the other +- two strings. Another example is given in the discussion of DEFINE ++ is used, it does match "sense and responsibility" as well as the other ++ two strings. Another example is given in the discussion of DEFINE + above. + +- All subroutine calls, whether recursive or not, are always treated as +- atomic groups. That is, once a subroutine has matched some of the sub- ++ All subroutine calls, whether recursive or not, are always treated as ++ atomic groups. That is, once a subroutine has matched some of the sub- + ject string, it is never re-entered, even if it contains untried alter- +- natives and there is a subsequent matching failure. 
Any capturing +- parentheses that are set during the subroutine call revert to their ++ natives and there is a subsequent matching failure. Any capturing ++ parentheses that are set during the subroutine call revert to their + previous values afterwards. + +- Processing options such as case-independence are fixed when a subpat- +- tern is defined, so if it is used as a subroutine, such options cannot ++ Processing options such as case-independence are fixed when a subpat- ++ tern is defined, so if it is used as a subroutine, such options cannot + be changed for different calls. For example, consider this pattern: + + (abc)(?i:(?-1)) + +- It matches "abcabc". It does not match "abcABC" because the change of ++ It matches "abcabc". It does not match "abcABC" because the change of + processing option does not affect the called subpattern. + + + ONIGURUMA SUBROUTINE SYNTAX + +- For compatibility with Oniguruma, the non-Perl syntax \g followed by a ++ For compatibility with Oniguruma, the non-Perl syntax \g followed by a + name or a number enclosed either in angle brackets or single quotes, is +- an alternative syntax for referencing a subpattern as a subroutine, +- possibly recursively. Here are two of the examples used above, rewrit- ++ an alternative syntax for referencing a subpattern as a subroutine, ++ possibly recursively. Here are two of the examples used above, rewrit- + ten using this syntax: + + (? \( ( (?>[^()]+) | \g )* \) ) + (sens|respons)e and \g'1'ibility + +- PCRE supports an extension to Oniguruma: if a number is preceded by a ++ PCRE supports an extension to Oniguruma: if a number is preceded by a + plus or a minus sign it is taken as a relative reference. For example: + + (abc)(?i:\g<-1>) + +- Note that \g{...} (Perl syntax) and \g<...> (Oniguruma syntax) are not +- synonymous. The former is a back reference; the latter is a subroutine ++ Note that \g{...} (Perl syntax) and \g<...> (Oniguruma syntax) are not ++ synonymous. 
The former is a back reference; the latter is a subroutine + call. + + + CALLOUTS + + Perl has a feature whereby using the sequence (?{...}) causes arbitrary +- Perl code to be obeyed in the middle of matching a regular expression. ++ Perl code to be obeyed in the middle of matching a regular expression. + This makes it possible, amongst other things, to extract different sub- + strings that match the same pair of parentheses when there is a repeti- + tion. + + PCRE provides a similar feature, but of course it cannot obey arbitrary + Perl code. The feature is called "callout". The caller of PCRE provides +- an external function by putting its entry point in the global variable +- pcre_callout (8-bit library) or pcre[16|32]_callout (16-bit or 32-bit +- library). By default, this variable contains NULL, which disables all ++ an external function by putting its entry point in the global variable ++ pcre_callout (8-bit library) or pcre[16|32]_callout (16-bit or 32-bit ++ library). By default, this variable contains NULL, which disables all + calling out. + +- Within a regular expression, (?C) indicates the points at which the +- external function is to be called. If you want to identify different +- callout points, you can put a number less than 256 after the letter C. +- The default value is zero. For example, this pattern has two callout ++ Within a regular expression, (?C) indicates the points at which the ++ external function is to be called. If you want to identify different ++ callout points, you can put a number less than 256 after the letter C. ++ The default value is zero. For example, this pattern has two callout + points: + + (?C1)abc(?C2)def + +- If the PCRE_AUTO_CALLOUT flag is passed to a compiling function, call- +- outs are automatically installed before each item in the pattern. They +- are all numbered 255. 
If there is a conditional group in the pattern ++ If the PCRE_AUTO_CALLOUT flag is passed to a compiling function, call- ++ outs are automatically installed before each item in the pattern. They ++ are all numbered 255. If there is a conditional group in the pattern + whose condition is an assertion, an additional callout is inserted just + before the condition. An explicit callout may also be set at this posi- + tion, as in this example: +@@ -7230,120 +7254,120 @@ CALLOUTS + Note that this applies only to assertion conditions, not to other types + of condition. + +- During matching, when PCRE reaches a callout point, the external func- +- tion is called. It is provided with the number of the callout, the +- position in the pattern, and, optionally, one item of data originally +- supplied by the caller of the matching function. The callout function ++ During matching, when PCRE reaches a callout point, the external func- ++ tion is called. It is provided with the number of the callout, the ++ position in the pattern, and, optionally, one item of data originally ++ supplied by the caller of the matching function. The callout function + may cause matching to proceed, to backtrack, or to fail altogether. + +- By default, PCRE implements a number of optimizations at compile time +- and matching time, and one side-effect is that sometimes callouts are +- skipped. If you need all possible callouts to happen, you need to set +- options that disable the relevant optimizations. More details, and a +- complete description of the interface to the callout function, are ++ By default, PCRE implements a number of optimizations at compile time ++ and matching time, and one side-effect is that sometimes callouts are ++ skipped. If you need all possible callouts to happen, you need to set ++ options that disable the relevant optimizations. More details, and a ++ complete description of the interface to the callout function, are + given in the pcrecallout documentation. 
+ + + BACKTRACKING CONTROL + +- Perl 5.10 introduced a number of "Special Backtracking Control Verbs", +- which are still described in the Perl documentation as "experimental +- and subject to change or removal in a future version of Perl". It goes +- on to say: "Their usage in production code should be noted to avoid +- problems during upgrades." The same remarks apply to the PCRE features ++ Perl 5.10 introduced a number of "Special Backtracking Control Verbs", ++ which are still described in the Perl documentation as "experimental ++ and subject to change or removal in a future version of Perl". It goes ++ on to say: "Their usage in production code should be noted to avoid ++ problems during upgrades." The same remarks apply to the PCRE features + described in this section. + +- The new verbs make use of what was previously invalid syntax: an open- ++ The new verbs make use of what was previously invalid syntax: an open- + ing parenthesis followed by an asterisk. They are generally of the form +- (*VERB) or (*VERB:NAME). Some may take either form, possibly behaving +- differently depending on whether or not a name is present. A name is ++ (*VERB) or (*VERB:NAME). Some may take either form, possibly behaving ++ differently depending on whether or not a name is present. A name is + any sequence of characters that does not include a closing parenthesis. + The maximum length of name is 255 in the 8-bit library and 65535 in the +- 16-bit and 32-bit libraries. If the name is empty, that is, if the +- closing parenthesis immediately follows the colon, the effect is as if +- the colon were not there. Any number of these verbs may occur in a ++ 16-bit and 32-bit libraries. If the name is empty, that is, if the ++ closing parenthesis immediately follows the colon, the effect is as if ++ the colon were not there. Any number of these verbs may occur in a + pattern. 
+ +- Since these verbs are specifically related to backtracking, most of +- them can be used only when the pattern is to be matched using one of +- the traditional matching functions, because these use a backtracking +- algorithm. With the exception of (*FAIL), which behaves like a failing +- negative assertion, the backtracking control verbs cause an error if ++ Since these verbs are specifically related to backtracking, most of ++ them can be used only when the pattern is to be matched using one of ++ the traditional matching functions, because these use a backtracking ++ algorithm. With the exception of (*FAIL), which behaves like a failing ++ negative assertion, the backtracking control verbs cause an error if + encountered by a DFA matching function. + +- The behaviour of these verbs in repeated groups, assertions, and in ++ The behaviour of these verbs in repeated groups, assertions, and in + subpatterns called as subroutines (whether or not recursively) is docu- + mented below. + + Optimizations that affect backtracking verbs + +- PCRE contains some optimizations that are used to speed up matching by ++ PCRE contains some optimizations that are used to speed up matching by + running some checks at the start of each match attempt. For example, it +- may know the minimum length of matching subject, or that a particular ++ may know the minimum length of matching subject, or that a particular + character must be present. When one of these optimizations bypasses the +- running of a match, any included backtracking verbs will not, of ++ running of a match, any included backtracking verbs will not, of + course, be processed. You can suppress the start-of-match optimizations +- by setting the PCRE_NO_START_OPTIMIZE option when calling pcre_com- ++ by setting the PCRE_NO_START_OPTIMIZE option when calling pcre_com- + pile() or pcre_exec(), or by starting the pattern with (*NO_START_OPT). 
+ There is more discussion of this option in the section entitled "Option + bits for pcre_exec()" in the pcreapi documentation. + +- Experiments with Perl suggest that it too has similar optimizations, ++ Experiments with Perl suggest that it too has similar optimizations, + sometimes leading to anomalous results. + + Verbs that act immediately + +- The following verbs act as soon as they are encountered. They may not ++ The following verbs act as soon as they are encountered. They may not + be followed by a name. + + (*ACCEPT) + +- This verb causes the match to end successfully, skipping the remainder +- of the pattern. However, when it is inside a subpattern that is called +- as a subroutine, only that subpattern is ended successfully. Matching ++ This verb causes the match to end successfully, skipping the remainder ++ of the pattern. However, when it is inside a subpattern that is called ++ as a subroutine, only that subpattern is ended successfully. Matching + then continues at the outer level. If (*ACCEPT) in triggered in a posi- +- tive assertion, the assertion succeeds; in a negative assertion, the ++ tive assertion, the assertion succeeds; in a negative assertion, the + assertion fails. + +- If (*ACCEPT) is inside capturing parentheses, the data so far is cap- ++ If (*ACCEPT) is inside capturing parentheses, the data so far is cap- + tured. For example: + + A((?:A|B(*ACCEPT)|C)D) + +- This matches "AB", "AAD", or "ACD"; when it matches "AB", "B" is cap- ++ This matches "AB", "AAD", or "ACD"; when it matches "AB", "B" is cap- + tured by the outer parentheses. + + (*FAIL) or (*F) + +- This verb causes a matching failure, forcing backtracking to occur. It +- is equivalent to (?!) but easier to read. The Perl documentation notes +- that it is probably useful only when combined with (?{}) or (??{}). +- Those are, of course, Perl features that are not present in PCRE. 
The +- nearest equivalent is the callout feature, as for example in this pat- ++ This verb causes a matching failure, forcing backtracking to occur. It ++ is equivalent to (?!) but easier to read. The Perl documentation notes ++ that it is probably useful only when combined with (?{}) or (??{}). ++ Those are, of course, Perl features that are not present in PCRE. The ++ nearest equivalent is the callout feature, as for example in this pat- + tern: + + a+(?C)(*FAIL) + +- A match with the string "aaaa" always fails, but the callout is taken ++ A match with the string "aaaa" always fails, but the callout is taken + before each backtrack happens (in this example, 10 times). + + Recording which path was taken + +- There is one verb whose main purpose is to track how a match was +- arrived at, though it also has a secondary use in conjunction with ++ There is one verb whose main purpose is to track how a match was ++ arrived at, though it also has a secondary use in conjunction with + advancing the match starting point (see (*SKIP) below). + + (*MARK:NAME) or (*:NAME) + +- A name is always required with this verb. There may be as many +- instances of (*MARK) as you like in a pattern, and their names do not ++ A name is always required with this verb. There may be as many ++ instances of (*MARK) as you like in a pattern, and their names do not + have to be unique. + +- When a match succeeds, the name of the last-encountered (*MARK:NAME), +- (*PRUNE:NAME), or (*THEN:NAME) on the matching path is passed back to +- the caller as described in the section entitled "Extra data for +- pcre_exec()" in the pcreapi documentation. Here is an example of +- pcretest output, where the /K modifier requests the retrieval and out- ++ When a match succeeds, the name of the last-encountered (*MARK:NAME), ++ (*PRUNE:NAME), or (*THEN:NAME) on the matching path is passed back to ++ the caller as described in the section entitled "Extra data for ++ pcre_exec()" in the pcreapi documentation. 
Here is an example of ++ pcretest output, where the /K modifier requests the retrieval and out- + putting of (*MARK) data: + + re> /X(*MARK:A)Y|X(*MARK:B)Z/K +@@ -7355,73 +7379,73 @@ BACKTRACKING CONTROL + MK: B + + The (*MARK) name is tagged with "MK:" in this output, and in this exam- +- ple it indicates which of the two alternatives matched. This is a more +- efficient way of obtaining this information than putting each alterna- ++ ple it indicates which of the two alternatives matched. This is a more ++ efficient way of obtaining this information than putting each alterna- + tive in its own capturing parentheses. + +- If a verb with a name is encountered in a positive assertion that is +- true, the name is recorded and passed back if it is the last-encoun- ++ If a verb with a name is encountered in a positive assertion that is ++ true, the name is recorded and passed back if it is the last-encoun- + tered. This does not happen for negative assertions or failing positive + assertions. + +- After a partial match or a failed match, the last encountered name in ++ After a partial match or a failed match, the last encountered name in + the entire match process is returned. For example: + + re> /X(*MARK:A)Y|X(*MARK:B)Z/K + data> XP + No match, mark = B + +- Note that in this unanchored example the mark is retained from the ++ Note that in this unanchored example the mark is retained from the + match attempt that started at the letter "X" in the subject. Subsequent + match attempts starting at "P" and then with an empty string do not get + as far as the (*MARK) item, but nevertheless do not reset it. + +- If you are interested in (*MARK) values after failed matches, you +- should probably set the PCRE_NO_START_OPTIMIZE option (see above) to ++ If you are interested in (*MARK) values after failed matches, you ++ should probably set the PCRE_NO_START_OPTIMIZE option (see above) to + ensure that the match is always attempted. 
+ + Verbs that act after backtracking + + The following verbs do nothing when they are encountered. Matching con- +- tinues with what follows, but if there is no subsequent match, causing +- a backtrack to the verb, a failure is forced. That is, backtracking +- cannot pass to the left of the verb. However, when one of these verbs ++ tinues with what follows, but if there is no subsequent match, causing ++ a backtrack to the verb, a failure is forced. That is, backtracking ++ cannot pass to the left of the verb. However, when one of these verbs + appears inside an atomic group or an assertion that is true, its effect +- is confined to that group, because once the group has been matched, +- there is never any backtracking into it. In this situation, backtrack- +- ing can "jump back" to the left of the entire atomic group or asser- +- tion. (Remember also, as stated above, that this localization also ++ is confined to that group, because once the group has been matched, ++ there is never any backtracking into it. In this situation, backtrack- ++ ing can "jump back" to the left of the entire atomic group or asser- ++ tion. (Remember also, as stated above, that this localization also + applies in subroutine calls.) + +- These verbs differ in exactly what kind of failure occurs when back- +- tracking reaches them. The behaviour described below is what happens +- when the verb is not in a subroutine or an assertion. Subsequent sec- ++ These verbs differ in exactly what kind of failure occurs when back- ++ tracking reaches them. The behaviour described below is what happens ++ when the verb is not in a subroutine or an assertion. Subsequent sec- + tions cover these special cases. + + (*COMMIT) + +- This verb, which may not be followed by a name, causes the whole match ++ This verb, which may not be followed by a name, causes the whole match + to fail outright if there is a later matching failure that causes back- +- tracking to reach it. 
Even if the pattern is unanchored, no further ++ tracking to reach it. Even if the pattern is unanchored, no further + attempts to find a match by advancing the starting point take place. If +- (*COMMIT) is the only backtracking verb that is encountered, once it ++ (*COMMIT) is the only backtracking verb that is encountered, once it + has been passed pcre_exec() is committed to finding a match at the cur- + rent starting point, or not at all. For example: + + a+(*COMMIT)b + +- This matches "xxaab" but not "aacaab". It can be thought of as a kind ++ This matches "xxaab" but not "aacaab". It can be thought of as a kind + of dynamic anchor, or "I've started, so I must finish." The name of the +- most recently passed (*MARK) in the path is passed back when (*COMMIT) ++ most recently passed (*MARK) in the path is passed back when (*COMMIT) + forces a match failure. + +- If there is more than one backtracking verb in a pattern, a different +- one that follows (*COMMIT) may be triggered first, so merely passing ++ If there is more than one backtracking verb in a pattern, a different ++ one that follows (*COMMIT) may be triggered first, so merely passing + (*COMMIT) during a match does not always guarantee that a match must be + at this starting point. + +- Note that (*COMMIT) at the start of a pattern is not the same as an +- anchor, unless PCRE's start-of-match optimizations are turned off, as ++ Note that (*COMMIT) at the start of a pattern is not the same as an ++ anchor, unless PCRE's start-of-match optimizations are turned off, as + shown in this output from pcretest: + + re> /(*COMMIT)abc/ +@@ -7432,207 +7456,207 @@ BACKTRACKING CONTROL + + For this pattern, PCRE knows that any match must start with "a", so the + optimization skips along the subject to "a" before applying the pattern +- to the first set of data. The match attempt then succeeds. In the sec- +- ond set of data, the escape sequence \Y is interpreted by the pcretest +- program. 
It causes the PCRE_NO_START_OPTIMIZE option to be set when ++ to the first set of data. The match attempt then succeeds. In the sec- ++ ond set of data, the escape sequence \Y is interpreted by the pcretest ++ program. It causes the PCRE_NO_START_OPTIMIZE option to be set when + pcre_exec() is called. This disables the optimization that skips along + to the first character. The pattern is now applied starting at "x", and +- so the (*COMMIT) causes the match to fail without trying any other ++ so the (*COMMIT) causes the match to fail without trying any other + starting points. + + (*PRUNE) or (*PRUNE:NAME) + +- This verb causes the match to fail at the current starting position in ++ This verb causes the match to fail at the current starting position in + the subject if there is a later matching failure that causes backtrack- +- ing to reach it. If the pattern is unanchored, the normal "bumpalong" +- advance to the next starting character then happens. Backtracking can +- occur as usual to the left of (*PRUNE), before it is reached, or when +- matching to the right of (*PRUNE), but if there is no match to the +- right, backtracking cannot cross (*PRUNE). In simple cases, the use of +- (*PRUNE) is just an alternative to an atomic group or possessive quan- ++ ing to reach it. If the pattern is unanchored, the normal "bumpalong" ++ advance to the next starting character then happens. Backtracking can ++ occur as usual to the left of (*PRUNE), before it is reached, or when ++ matching to the right of (*PRUNE), but if there is no match to the ++ right, backtracking cannot cross (*PRUNE). In simple cases, the use of ++ (*PRUNE) is just an alternative to an atomic group or possessive quan- + tifier, but there are some uses of (*PRUNE) that cannot be expressed in +- any other way. In an anchored pattern (*PRUNE) has the same effect as ++ any other way. In an anchored pattern (*PRUNE) has the same effect as + (*COMMIT). 
+ + The behaviour of (*PRUNE:NAME) is the not the same as +- (*MARK:NAME)(*PRUNE). It is like (*MARK:NAME) in that the name is +- remembered for passing back to the caller. However, (*SKIP:NAME) ++ (*MARK:NAME)(*PRUNE). It is like (*MARK:NAME) in that the name is ++ remembered for passing back to the caller. However, (*SKIP:NAME) + searches only for names set with (*MARK). + + (*SKIP) + +- This verb, when given without a name, is like (*PRUNE), except that if +- the pattern is unanchored, the "bumpalong" advance is not to the next ++ This verb, when given without a name, is like (*PRUNE), except that if ++ the pattern is unanchored, the "bumpalong" advance is not to the next + character, but to the position in the subject where (*SKIP) was encoun- +- tered. (*SKIP) signifies that whatever text was matched leading up to ++ tered. (*SKIP) signifies that whatever text was matched leading up to + it cannot be part of a successful match. Consider: + + a+(*SKIP)b + +- If the subject is "aaaac...", after the first match attempt fails +- (starting at the first character in the string), the starting point ++ If the subject is "aaaac...", after the first match attempt fails ++ (starting at the first character in the string), the starting point + skips on to start the next attempt at "c". Note that a possessive quan- +- tifer does not have the same effect as this example; although it would +- suppress backtracking during the first match attempt, the second +- attempt would start at the second character instead of skipping on to ++ tifer does not have the same effect as this example; although it would ++ suppress backtracking during the first match attempt, the second ++ attempt would start at the second character instead of skipping on to + "c". + + (*SKIP:NAME) + + When (*SKIP) has an associated name, its behaviour is modified. When it + is triggered, the previous path through the pattern is searched for the +- most recent (*MARK) that has the same name. 
If one is found, the ++ most recent (*MARK) that has the same name. If one is found, the + "bumpalong" advance is to the subject position that corresponds to that + (*MARK) instead of to where (*SKIP) was encountered. If no (*MARK) with + a matching name is found, the (*SKIP) is ignored. + +- Note that (*SKIP:NAME) searches only for names set by (*MARK:NAME). It ++ Note that (*SKIP:NAME) searches only for names set by (*MARK:NAME). It + ignores names that are set by (*PRUNE:NAME) or (*THEN:NAME). + + (*THEN) or (*THEN:NAME) + +- This verb causes a skip to the next innermost alternative when back- +- tracking reaches it. That is, it cancels any further backtracking +- within the current alternative. Its name comes from the observation ++ This verb causes a skip to the next innermost alternative when back- ++ tracking reaches it. That is, it cancels any further backtracking ++ within the current alternative. Its name comes from the observation + that it can be used for a pattern-based if-then-else block: + + ( COND1 (*THEN) FOO | COND2 (*THEN) BAR | COND3 (*THEN) BAZ ) ... + +- If the COND1 pattern matches, FOO is tried (and possibly further items +- after the end of the group if FOO succeeds); on failure, the matcher +- skips to the second alternative and tries COND2, without backtracking +- into COND1. If that succeeds and BAR fails, COND3 is tried. If subse- +- quently BAZ fails, there are no more alternatives, so there is a back- +- track to whatever came before the entire group. If (*THEN) is not ++ If the COND1 pattern matches, FOO is tried (and possibly further items ++ after the end of the group if FOO succeeds); on failure, the matcher ++ skips to the second alternative and tries COND2, without backtracking ++ into COND1. If that succeeds and BAR fails, COND3 is tried. If subse- ++ quently BAZ fails, there are no more alternatives, so there is a back- ++ track to whatever came before the entire group. 
If (*THEN) is not + inside an alternation, it acts like (*PRUNE). + +- The behaviour of (*THEN:NAME) is the not the same as +- (*MARK:NAME)(*THEN). It is like (*MARK:NAME) in that the name is +- remembered for passing back to the caller. However, (*SKIP:NAME) ++ The behaviour of (*THEN:NAME) is the not the same as ++ (*MARK:NAME)(*THEN). It is like (*MARK:NAME) in that the name is ++ remembered for passing back to the caller. However, (*SKIP:NAME) + searches only for names set with (*MARK). + +- A subpattern that does not contain a | character is just a part of the +- enclosing alternative; it is not a nested alternation with only one +- alternative. The effect of (*THEN) extends beyond such a subpattern to +- the enclosing alternative. Consider this pattern, where A, B, etc. are +- complex pattern fragments that do not contain any | characters at this ++ A subpattern that does not contain a | character is just a part of the ++ enclosing alternative; it is not a nested alternation with only one ++ alternative. The effect of (*THEN) extends beyond such a subpattern to ++ the enclosing alternative. Consider this pattern, where A, B, etc. are ++ complex pattern fragments that do not contain any | characters at this + level: + + A (B(*THEN)C) | D + +- If A and B are matched, but there is a failure in C, matching does not ++ If A and B are matched, but there is a failure in C, matching does not + backtrack into A; instead it moves to the next alternative, that is, D. +- However, if the subpattern containing (*THEN) is given an alternative, ++ However, if the subpattern containing (*THEN) is given an alternative, + it behaves differently: + + A (B(*THEN)C | (*FAIL)) | D + +- The effect of (*THEN) is now confined to the inner subpattern. After a ++ The effect of (*THEN) is now confined to the inner subpattern. After a + failure in C, matching moves to (*FAIL), which causes the whole subpat- +- tern to fail because there are no more alternatives to try. 
In this ++ tern to fail because there are no more alternatives to try. In this + case, matching does now backtrack into A. + +- Note that a conditional subpattern is not considered as having two +- alternatives, because only one is ever used. In other words, the | ++ Note that a conditional subpattern is not considered as having two ++ alternatives, because only one is ever used. In other words, the | + character in a conditional subpattern has a different meaning. Ignoring + white space, consider: + + ^.*? (?(?=a) a | b(*THEN)c ) + +- If the subject is "ba", this pattern does not match. Because .*? is +- ungreedy, it initially matches zero characters. The condition (?=a) +- then fails, the character "b" is matched, but "c" is not. At this +- point, matching does not backtrack to .*? as might perhaps be expected +- from the presence of the | character. The conditional subpattern is ++ If the subject is "ba", this pattern does not match. Because .*? is ++ ungreedy, it initially matches zero characters. The condition (?=a) ++ then fails, the character "b" is matched, but "c" is not. At this ++ point, matching does not backtrack to .*? as might perhaps be expected ++ from the presence of the | character. The conditional subpattern is + part of the single alternative that comprises the whole pattern, and so +- the match fails. (If there was a backtrack into .*?, allowing it to ++ the match fails. (If there was a backtrack into .*?, allowing it to + match "b", the match would succeed.) + +- The verbs just described provide four different "strengths" of control ++ The verbs just described provide four different "strengths" of control + when subsequent matching fails. (*THEN) is the weakest, carrying on the +- match at the next alternative. (*PRUNE) comes next, failing the match +- at the current starting position, but allowing an advance to the next +- character (for an unanchored pattern). (*SKIP) is similar, except that ++ match at the next alternative. 
(*PRUNE) comes next, failing the match ++ at the current starting position, but allowing an advance to the next ++ character (for an unanchored pattern). (*SKIP) is similar, except that + the advance may be more than one character. (*COMMIT) is the strongest, + causing the entire match to fail. + + More than one backtracking verb + +- If more than one backtracking verb is present in a pattern, the one +- that is backtracked onto first acts. For example, consider this pat- ++ If more than one backtracking verb is present in a pattern, the one ++ that is backtracked onto first acts. For example, consider this pat- + tern, where A, B, etc. are complex pattern fragments: + + (A(*COMMIT)B(*THEN)C|ABD) + +- If A matches but B fails, the backtrack to (*COMMIT) causes the entire ++ If A matches but B fails, the backtrack to (*COMMIT) causes the entire + match to fail. However, if A and B match, but C fails, the backtrack to +- (*THEN) causes the next alternative (ABD) to be tried. This behaviour +- is consistent, but is not always the same as Perl's. It means that if +- two or more backtracking verbs appear in succession, all the the last ++ (*THEN) causes the next alternative (ABD) to be tried. This behaviour ++ is consistent, but is not always the same as Perl's. It means that if ++ two or more backtracking verbs appear in succession, all the the last + of them has no effect. Consider this example: + + ...(*COMMIT)(*PRUNE)... + + If there is a matching failure to the right, backtracking onto (*PRUNE) +- causes it to be triggered, and its action is taken. There can never be ++ causes it to be triggered, and its action is taken. There can never be + a backtrack onto (*COMMIT). + + Backtracking verbs in repeated groups + +- PCRE differs from Perl in its handling of backtracking verbs in ++ PCRE differs from Perl in its handling of backtracking verbs in + repeated groups. 
For example, consider: + + /(a(*COMMIT)b)+ac/ + +- If the subject is "abac", Perl matches, but PCRE fails because the ++ If the subject is "abac", Perl matches, but PCRE fails because the + (*COMMIT) in the second repeat of the group acts. + + Backtracking verbs in assertions + +- (*FAIL) in an assertion has its normal effect: it forces an immediate ++ (*FAIL) in an assertion has its normal effect: it forces an immediate + backtrack. + + (*ACCEPT) in a positive assertion causes the assertion to succeed with- +- out any further processing. In a negative assertion, (*ACCEPT) causes ++ out any further processing. In a negative assertion, (*ACCEPT) causes + the assertion to fail without any further processing. + +- The other backtracking verbs are not treated specially if they appear +- in a positive assertion. In particular, (*THEN) skips to the next +- alternative in the innermost enclosing group that has alternations, ++ The other backtracking verbs are not treated specially if they appear ++ in a positive assertion. In particular, (*THEN) skips to the next ++ alternative in the innermost enclosing group that has alternations, + whether or not this is within the assertion. + +- Negative assertions are, however, different, in order to ensure that +- changing a positive assertion into a negative assertion changes its ++ Negative assertions are, however, different, in order to ensure that ++ changing a positive assertion into a negative assertion changes its + result. Backtracking into (*COMMIT), (*SKIP), or (*PRUNE) causes a neg- + ative assertion to be true, without considering any further alternative + branches in the assertion. 
Backtracking into (*THEN) causes it to skip +- to the next enclosing alternative within the assertion (the normal be- +- haviour), but if the assertion does not have such an alternative, ++ to the next enclosing alternative within the assertion (the normal be- ++ haviour), but if the assertion does not have such an alternative, + (*THEN) behaves like (*PRUNE). + + Backtracking verbs in subroutines + +- These behaviours occur whether or not the subpattern is called recur- ++ These behaviours occur whether or not the subpattern is called recur- + sively. Perl's treatment of subroutines is different in some cases. + +- (*FAIL) in a subpattern called as a subroutine has its normal effect: ++ (*FAIL) in a subpattern called as a subroutine has its normal effect: + it forces an immediate backtrack. + +- (*ACCEPT) in a subpattern called as a subroutine causes the subroutine +- match to succeed without any further processing. Matching then contin- ++ (*ACCEPT) in a subpattern called as a subroutine causes the subroutine ++ match to succeed without any further processing. Matching then contin- + ues after the subroutine call. + + (*COMMIT), (*SKIP), and (*PRUNE) in a subpattern called as a subroutine + cause the subroutine match to fail. + +- (*THEN) skips to the next alternative in the innermost enclosing group +- within the subpattern that has alternatives. If there is no such group ++ (*THEN) skips to the next alternative in the innermost enclosing group ++ within the subpattern that has alternatives. If there is no such group + within the subpattern, (*THEN) causes the subroutine match to fail. + + + SEE ALSO + +- pcreapi(3), pcrecallout(3), pcrematching(3), pcresyntax(3), pcre(3), ++ pcreapi(3), pcrecallout(3), pcrematching(3), pcresyntax(3), pcre(3), + pcre16(3), pcre32(3). + + +@@ -7645,8 +7669,8 @@ AUTHOR + + REVISION + +- Last updated: 08 January 2014 +- Copyright (c) 1997-2014 University of Cambridge. 
++ Last updated: 14 June 2015 ++ Copyright (c) 1997-2015 University of Cambridge. + ------------------------------------------------------------------------------ + + +diff --git a/ext/pcre/pcrelib/pcre.h b/ext/pcre/pcrelib/pcre.h +index 58ed46a..bf6351f 100644 +--- a/ext/pcre/pcrelib/pcre.h ++++ b/ext/pcre/pcrelib/pcre.h +@@ -42,9 +42,9 @@ POSSIBILITY OF SUCH DAMAGE. + /* The current PCRE version information. */ + + #define PCRE_MAJOR 8 +-#define PCRE_MINOR 37 ++#define PCRE_MINOR 38 + #define PCRE_PRERELEASE +-#define PCRE_DATE 2015-04-28 ++#define PCRE_DATE 2015-11-23 + + /* When an application links to a PCRE DLL in Windows, the symbols that are + imported have to be identified as such. When building PCRE, the appropriate +diff --git a/ext/pcre/pcrelib/pcre_compile.c b/ext/pcre/pcrelib/pcre_compile.c +index 0efad26..4d3b313 100644 +--- a/ext/pcre/pcrelib/pcre_compile.c ++++ b/ext/pcre/pcrelib/pcre_compile.c +@@ -174,7 +174,7 @@ static const short int escapes[] = { + -ESC_Z, CHAR_LEFT_SQUARE_BRACKET, + CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET, + CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE, +- CHAR_GRAVE_ACCENT, 7, ++ CHAR_GRAVE_ACCENT, ESC_a, + -ESC_b, 0, + -ESC_d, ESC_e, + ESC_f, 0, +@@ -202,9 +202,9 @@ static const short int escapes[] = { + /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?', + /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0, + /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"', +-/* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, ++/* 80 */ 0, ESC_a, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, + /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0, +-/* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p, ++/* 90 */ 0, 0, -ESC_k, 0, 0, ESC_n, 0, -ESC_p, + /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0, + /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0, + /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0, +@@ -219,6 +219,12 @@ static const short int escapes[] = { + /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0, + /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0 + }; ++ ++/* We also need a table of characters that may follow \c in an EBCDIC 
++environment for characters 0-31. */ ++ ++static unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_"; ++ + #endif + + +@@ -458,7 +464,7 @@ static const char error_texts[] = + "range out of order in character class\0" + "nothing to repeat\0" + /* 10 */ +- "operand of unlimited repeat could match the empty string\0" /** DEAD **/ ++ "internal error: invalid forward reference offset\0" + "internal error: unexpected repeat\0" + "unrecognized character after (? or (?-\0" + "POSIX named classes are supported only within a class\0" +@@ -527,7 +533,11 @@ static const char error_texts[] = + "different names for subpatterns of the same number are not allowed\0" + "(*MARK) must have an argument\0" + "this version of PCRE is not compiled with Unicode property support\0" ++#ifndef EBCDIC + "\\c must be followed by an ASCII character\0" ++#else ++ "\\c must be followed by a letter or one of [\\]^_?\0" ++#endif + "\\k is not followed by a braced, angle-bracketed, or quoted name\0" + /* 70 */ + "internal error: unknown opcode in find_fixedlength()\0" +@@ -1425,7 +1435,16 @@ else + c ^= 0x40; + #else /* EBCDIC coding */ + if (c >= CHAR_a && c <= CHAR_z) c += 64; +- c ^= 0xC0; ++ if (c == CHAR_QUESTION_MARK) ++ c = ('\\' == 188 && '`' == 74)? 
0x5f : 0xff; ++ else ++ { ++ for (i = 0; i < 32; i++) ++ { ++ if (c == ebcdic_escape_c[i]) break; ++ } ++ if (i < 32) c = i; else *errorcodeptr = ERR68; ++ } + #endif + break; + +@@ -1799,7 +1818,7 @@ for (;;) + case OP_ASSERTBACK: + case OP_ASSERTBACK_NOT: + do cc += GET(cc, 1); while (*cc == OP_ALT); +- cc += PRIV(OP_lengths)[*cc]; ++ cc += 1 + LINK_SIZE; + break; + + /* Skip over things that don't match chars */ +@@ -2487,7 +2506,7 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE); + if (c == OP_BRA || c == OP_BRAPOS || + c == OP_CBRA || c == OP_CBRAPOS || + c == OP_ONCE || c == OP_ONCE_NC || +- c == OP_COND) ++ c == OP_COND || c == OP_SCOND) + { + BOOL empty_branch; + if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */ +@@ -3886,11 +3905,11 @@ didn't consider this to be a POSIX class. Likewise for [:1234:]. + The problem in trying to be exactly like Perl is in the handling of escapes. We + have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX + class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code +-below handles the special case of \], but does not try to do any other escape +-processing. This makes it different from Perl for cases such as [:l\ower:] +-where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize +-"l\ower". This is a lesser evil than not diagnosing bad classes when Perl does, +-I think. ++below handles the special cases \\ and \], but does not try to do any other ++escape processing. This makes it different from Perl for cases such as ++[:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does ++not recognize "l\ower". This is a lesser evil than not diagnosing bad classes ++when Perl does, I think. + + A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not. 
+ It seems that the appearance of a nested POSIX class supersedes an apparent +@@ -3917,21 +3936,16 @@ pcre_uchar terminator; /* Don't combine these lines; the Solaris cc */ + terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */ + for (++ptr; *ptr != CHAR_NULL; ptr++) + { +- if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ++ if (*ptr == CHAR_BACKSLASH && ++ (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET || ++ ptr[1] == CHAR_BACKSLASH)) + ptr++; +- else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE; +- else ++ else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) || ++ *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE; ++ else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) + { +- if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) +- { +- *endptr = ptr; +- return TRUE; +- } +- if (*ptr == CHAR_LEFT_SQUARE_BRACKET && +- (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT || +- ptr[1] == CHAR_EQUALS_SIGN) && +- check_posix_syntax(ptr, endptr)) +- return FALSE; ++ *endptr = ptr; ++ return TRUE; + } + } + return FALSE; +@@ -3985,11 +3999,12 @@ have their offsets adjusted. That one of the jobs of this function. Before it + is called, the partially compiled regex must be temporarily terminated with + OP_END. + +-This function has been extended with the possibility of forward references for +-recursions and subroutine calls. It must also check the list of such references +-for the group we are dealing with. If it finds that one of the recursions in +-the current group is on this list, it adjusts the offset in the list, not the +-value in the reference (which is a group number). ++This function has been extended to cope with forward references for recursions ++and subroutine calls. It must check the list of such references for the ++group we are dealing with. 
If it finds that one of the recursions in the ++current group is on this list, it does not adjust the value in the reference ++(which is a group number). After the group has been scanned, all the offsets in ++the forward reference list for the group are adjusted. + + Arguments: + group points to the start of the group +@@ -4005,29 +4020,21 @@ static void + adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd, + size_t save_hwm_offset) + { ++int offset; ++pcre_uchar *hc; + pcre_uchar *ptr = group; + + while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL) + { +- int offset; +- pcre_uchar *hc; +- +- /* See if this recursion is on the forward reference list. If so, adjust the +- reference. */ +- + for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset; hc < cd->hwm; + hc += LINK_SIZE) + { + offset = (int)GET(hc, 0); +- if (cd->start_code + offset == ptr + 1) +- { +- PUT(hc, 0, offset + adjust); +- break; +- } ++ if (cd->start_code + offset == ptr + 1) break; + } + +- /* Otherwise, adjust the recursion offset if it's after the start of this +- group. */ ++ /* If we have not found this recursion on the forward reference list, adjust ++ the recursion's offset if it's after the start of this group. */ + + if (hc >= cd->hwm) + { +@@ -4037,6 +4044,15 @@ while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL) + + ptr += 1 + LINK_SIZE; + } ++ ++/* Now adjust all forward reference offsets for the group. 
*/ ++ ++for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset; hc < cd->hwm; ++ hc += LINK_SIZE) ++ { ++ offset = (int)GET(hc, 0); ++ PUT(hc, 0, offset + adjust); ++ } + } + + +@@ -4465,7 +4481,7 @@ const pcre_uchar *tempptr; + const pcre_uchar *nestptr = NULL; + pcre_uchar *previous = NULL; + pcre_uchar *previous_callout = NULL; +-size_t save_hwm_offset = 0; ++size_t item_hwm_offset = 0; + pcre_uint8 classbits[32]; + + /* We can fish out the UTF-8 setting once and for all into a BOOL, but we +@@ -4623,8 +4639,7 @@ for (;; ptr++) + /* In the real compile phase, just check the workspace used by the forward + reference list. */ + +- else if (cd->hwm > cd->start_workspace + cd->workspace_size - +- WORK_SIZE_SAFETY_MARGIN) ++ else if (cd->hwm > cd->start_workspace + cd->workspace_size) + { + *errorcodeptr = ERR52; + goto FAILED; +@@ -4767,6 +4782,7 @@ for (;; ptr++) + zeroreqchar = reqchar; + zeroreqcharflags = reqcharflags; + previous = code; ++ item_hwm_offset = cd->hwm - cd->start_workspace; + *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY; + break; + +@@ -4818,6 +4834,7 @@ for (;; ptr++) + /* Handle a real character class. */ + + previous = code; ++ item_hwm_offset = cd->hwm - cd->start_workspace; + + /* PCRE supports POSIX class stuff inside a class. Perl gives an error if + they are encountered at the top level, so we'll do that too. */ +@@ -4923,9 +4940,10 @@ for (;; ptr++) + (which is on the stack). We have to remember that there was XCLASS data, + however. */ + ++ if (class_uchardata > class_uchardata_base) xclass = TRUE; ++ + if (lengthptr != NULL && class_uchardata > class_uchardata_base) + { +- xclass = TRUE; + *lengthptr += (int)(class_uchardata - class_uchardata_base); + class_uchardata = class_uchardata_base; + } +@@ -5028,10 +5046,26 @@ for (;; ptr++) + ptr = tempptr + 1; + continue; + +- /* For all other POSIX classes, no special action is taken in UCP +- mode. Fall through to the non_UCP case. 
*/ ++ /* For the other POSIX classes (ascii, xdigit) we are going to fall ++ through to the non-UCP case and build a bit map for characters with ++ code points less than 256. If we are in a negated POSIX class ++ within a non-negated overall class, characters with code points ++ greater than 255 must all match. In the special case where we have ++ not yet generated any xclass data, and this is the final item in ++ the overall class, we need do nothing: later on, the opcode ++ OP_NCLASS will be used to indicate that characters greater than 255 ++ are acceptable. If we have already seen an xclass item or one may ++ follow (we have to assume that it might if this is not the end of ++ the class), explicitly match all wide codepoints. */ + + default: ++ if (!negate_class && local_negate && ++ (xclass || tempptr[2] != CHAR_RIGHT_SQUARE_BRACKET)) ++ { ++ *class_uchardata++ = XCL_RANGE; ++ class_uchardata += PRIV(ord2utf)(0x100, class_uchardata); ++ class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata); ++ } + break; + } + } +@@ -5195,9 +5229,9 @@ for (;; ptr++) + cd, PRIV(vspace_list)); + continue; + +-#ifdef SUPPORT_UCP + case ESC_p: + case ESC_P: ++#ifdef SUPPORT_UCP + { + BOOL negated; + unsigned int ptype = 0, pdata = 0; +@@ -5211,6 +5245,9 @@ for (;; ptr++) + class_has_8bitchar--; /* Undo! */ + continue; + } ++#else ++ *errorcodeptr = ERR45; ++ goto FAILED; + #endif + /* Unrecognized escapes are faulted if PCRE is running in its + strict mode. By default, for compatibility with Perl, they are +@@ -5367,16 +5404,20 @@ for (;; ptr++) + CLASS_SINGLE_CHARACTER: + if (class_one_char < 2) class_one_char++; + +- /* If class_one_char is 1, we have the first single character in the +- class, and there have been no prior ranges, or XCLASS items generated by +- escapes. If this is the final character in the class, we can optimize by +- turning the item into a 1-character OP_CHAR[I] if it's positive, or +- OP_NOT[I] if it's negative. 
In the positive case, it can cause firstchar +- to be set. Otherwise, there can be no first char if this item is first, +- whatever repeat count may follow. In the case of reqchar, save the +- previous value for reinstating. */ ++ /* If xclass_has_prop is false and class_one_char is 1, we have the first ++ single character in the class, and there have been no prior ranges, or ++ XCLASS items generated by escapes. If this is the final character in the ++ class, we can optimize by turning the item into a 1-character OP_CHAR[I] ++ if it's positive, or OP_NOT[I] if it's negative. In the positive case, it ++ can cause firstchar to be set. Otherwise, there can be no first char if ++ this item is first, whatever repeat count may follow. In the case of ++ reqchar, save the previous value for reinstating. */ + +- if (!inescq && class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ++ if (!inescq && ++#ifdef SUPPORT_UCP ++ !xclass_has_prop && ++#endif ++ class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) + { + ptr++; + zeroreqchar = reqchar; +@@ -5492,9 +5533,10 @@ for (;; ptr++) + actual compiled code. 
*/ + + #ifdef SUPPORT_UTF +- if (xclass && (!should_flip_negation || (options & PCRE_UCP) != 0)) ++ if (xclass && (xclass_has_prop || !should_flip_negation || ++ (options & PCRE_UCP) != 0)) + #elif !defined COMPILE_PCRE8 +- if (xclass && !should_flip_negation) ++ if (xclass && (xclass_has_prop || !should_flip_negation)) + #endif + #if defined SUPPORT_UTF || !defined COMPILE_PCRE8 + { +@@ -5930,7 +5972,7 @@ for (;; ptr++) + { + register int i; + int len = (int)(code - previous); +- size_t base_hwm_offset = save_hwm_offset; ++ size_t base_hwm_offset = item_hwm_offset; + pcre_uchar *bralink = NULL; + pcre_uchar *brazeroptr = NULL; + +@@ -5985,7 +6027,7 @@ for (;; ptr++) + if (repeat_max <= 1) /* Covers 0, 1, and unlimited */ + { + *code = OP_END; +- adjust_recurse(previous, 1, utf, cd, save_hwm_offset); ++ adjust_recurse(previous, 1, utf, cd, item_hwm_offset); + memmove(previous + 1, previous, IN_UCHARS(len)); + code++; + if (repeat_max == 0) +@@ -6009,7 +6051,7 @@ for (;; ptr++) + { + int offset; + *code = OP_END; +- adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, save_hwm_offset); ++ adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, item_hwm_offset); + memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len)); + code += 2 + LINK_SIZE; + *previous++ = OP_BRAZERO + repeat_type; +@@ -6254,6 +6296,12 @@ for (;; ptr++) + while (*scode == OP_ALT); + } + ++ /* A conditional group with only one branch has an implicit empty ++ alternative branch. */ ++ ++ if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT) ++ *bracode = OP_SCOND; ++ + /* Handle possessive quantifiers. 
*/ + + if (possessive_quantifier) +@@ -6267,11 +6315,11 @@ for (;; ptr++) + { + int nlen = (int)(code - bracode); + *code = OP_END; +- adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, save_hwm_offset); ++ adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, item_hwm_offset); + memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen)); + code += 1 + LINK_SIZE; + nlen += 1 + LINK_SIZE; +- *bracode = OP_BRAPOS; ++ *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS; + *code++ = OP_KETRPOS; + PUTINC(code, 0, nlen); + PUT(bracode, 1, nlen); +@@ -6401,7 +6449,7 @@ for (;; ptr++) + else + { + *code = OP_END; +- adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm_offset); ++ adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, item_hwm_offset); + memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len)); + code += 1 + LINK_SIZE; + len += 1 + LINK_SIZE; +@@ -6450,7 +6498,7 @@ for (;; ptr++) + + default: + *code = OP_END; +- adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm_offset); ++ adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, item_hwm_offset); + memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len)); + code += 1 + LINK_SIZE; + len += 1 + LINK_SIZE; +@@ -6586,9 +6634,17 @@ for (;; ptr++) + goto FAILED; + } + setverb = *code++ = verbs[i].op_arg; +- *code++ = arglen; +- memcpy(code, arg, IN_UCHARS(arglen)); +- code += arglen; ++ if (lengthptr != NULL) /* In pass 1 just add in the length */ ++ { /* to avoid potential workspace */ ++ *lengthptr += arglen; /* overflow. 
*/ ++ *code++ = 0; ++ } ++ else ++ { ++ *code++ = arglen; ++ memcpy(code, arg, IN_UCHARS(arglen)); ++ code += arglen; ++ } + *code++ = 0; + } + +@@ -6623,7 +6679,7 @@ for (;; ptr++) + newoptions = options; + skipbytes = 0; + bravalue = OP_CBRA; +- save_hwm_offset = cd->hwm - cd->start_workspace; ++ item_hwm_offset = cd->hwm - cd->start_workspace; + reset_bracount = FALSE; + + /* Deal with the extended parentheses; all are introduced by '?', and the +@@ -6641,6 +6697,7 @@ for (;; ptr++) + /* ------------------------------------------------------------ */ + case CHAR_VERTICAL_LINE: /* Reset capture count for each branch */ + reset_bracount = TRUE; ++ cd->dupgroups = TRUE; /* Record (?| encountered */ + /* Fall through */ + + /* ------------------------------------------------------------ */ +@@ -6741,6 +6798,12 @@ for (;; ptr++) + { + while (IS_DIGIT(*ptr)) + { ++ if (recno > INT_MAX / 10 - 1) /* Integer overflow */ ++ { ++ while (IS_DIGIT(*ptr)) ptr++; ++ *errorcodeptr = ERR61; ++ goto FAILED; ++ } + recno = recno * 10 + (int)(*ptr - CHAR_0); + ptr++; + } +@@ -6769,7 +6832,7 @@ for (;; ptr++) + ptr++; + } + namelen = (int)(ptr - name); +- if (lengthptr != NULL) *lengthptr += IMM2_SIZE; ++ if (lengthptr != NULL) skipbytes += IMM2_SIZE; + } + + /* Check the terminator */ +@@ -6875,6 +6938,11 @@ for (;; ptr++) + *errorcodeptr = ERR15; + goto FAILED; + } ++ if (recno > INT_MAX / 10 - 1) /* Integer overflow */ ++ { ++ *errorcodeptr = ERR61; ++ goto FAILED; ++ } + recno = recno * 10 + name[i] - CHAR_0; + } + if (recno == 0) recno = RREF_ANY; +@@ -7151,6 +7219,7 @@ for (;; ptr++) + if (lengthptr != NULL) + { + named_group *ng; ++ recno = 0; + + if (namelen == 0) + { +@@ -7168,20 +7237,6 @@ for (;; ptr++) + goto FAILED; + } + +- /* The name table does not exist in the first pass; instead we must +- scan the list of names encountered so far in order to get the +- number. If the name is not found, set the value to 0 for a forward +- reference. 
*/ +- +- ng = cd->named_groups; +- for (i = 0; i < cd->names_found; i++, ng++) +- { +- if (namelen == ng->length && +- STRNCMP_UC_UC(name, ng->name, namelen) == 0) +- break; +- } +- recno = (i < cd->names_found)? ng->number : 0; +- + /* Count named back references. */ + + if (!is_recurse) cd->namedrefcount++; +@@ -7191,6 +7246,56 @@ for (;; ptr++) + 16-bit data item. */ + + *lengthptr += IMM2_SIZE; ++ ++ /* If this is a forward reference and we are within a (?|...) group, ++ the reference may end up as the number of a group which we are ++ currently inside, that is, it could be a recursive reference. In the ++ real compile this will be picked up and the reference wrapped with ++ OP_ONCE to make it atomic, so we must allow space in case this occurs. */ ++ ++ /* In fact, this can happen for a non-forward reference because ++ another group with the same number might be created later. This ++ issue is fixed "properly" in PCRE2. As PCRE1 is now in maintenance ++ only mode, we finesse the bug by allowing more memory always. */ ++ ++ *lengthptr += 2 + 2*LINK_SIZE; ++ ++ /* It is even worse than that. The current reference may be to an ++ existing named group with a different number (so apparently not ++ recursive) but which later on is also attached to a group with the ++ current number. This can only happen if (?| has been previously ++ encountered. In that case, we allow yet more memory, just in case. ++ (Again, this is fixed "properly" in PCRE2.) */ ++ ++ if (cd->dupgroups) *lengthptr += 4 + 4*LINK_SIZE; ++ ++ /* Otherwise, check for recursion here. The name table does not exist ++ in the first pass; instead we must scan the list of names encountered ++ so far in order to get the number. If the name is not found, leave ++ the value of recno as 0 for a forward reference.
*/ ++ ++ else ++ { ++ ng = cd->named_groups; ++ for (i = 0; i < cd->names_found; i++, ng++) ++ { ++ if (namelen == ng->length && ++ STRNCMP_UC_UC(name, ng->name, namelen) == 0) ++ { ++ open_capitem *oc; ++ recno = ng->number; ++ if (is_recurse) break; ++ for (oc = cd->open_caps; oc != NULL; oc = oc->next) ++ { ++ if (oc->number == recno) ++ { ++ oc->flag = TRUE; ++ break; ++ } ++ } ++ } ++ } ++ } + } + + /* In the real compile, search the name table. We check the name +@@ -7237,8 +7342,6 @@ for (;; ptr++) + for (i++; i < cd->names_found; i++) + { + if (STRCMP_UC_UC(slot + IMM2_SIZE, cslot + IMM2_SIZE) != 0) break; +- +- + count++; + cslot += cd->name_entry_size; + } +@@ -7247,6 +7350,7 @@ for (;; ptr++) + { + if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE; + previous = code; ++ item_hwm_offset = cd->hwm - cd->start_workspace; + *code++ = ((options & PCRE_CASELESS) != 0)? OP_DNREFI : OP_DNREF; + PUT2INC(code, 0, index); + PUT2INC(code, 0, count); +@@ -7284,9 +7388,14 @@ for (;; ptr++) + + + /* ------------------------------------------------------------ */ +- case CHAR_R: /* Recursion */ +- ptr++; /* Same as (?0) */ +- /* Fall through */ ++ case CHAR_R: /* Recursion, same as (?0) */ ++ recno = 0; ++ if (*(++ptr) != CHAR_RIGHT_PARENTHESIS) ++ { ++ *errorcodeptr = ERR29; ++ goto FAILED; ++ } ++ goto HANDLE_RECURSION; + + + /* ------------------------------------------------------------ */ +@@ -7323,7 +7432,15 @@ for (;; ptr++) + + recno = 0; + while(IS_DIGIT(*ptr)) ++ { ++ if (recno > INT_MAX / 10 - 1) /* Integer overflow */ ++ { ++ while (IS_DIGIT(*ptr)) ptr++; ++ *errorcodeptr = ERR61; ++ goto FAILED; ++ } + recno = recno * 10 + *ptr++ - CHAR_0; ++ } + + if (*ptr != (pcre_uchar)terminator) + { +@@ -7360,6 +7477,7 @@ for (;; ptr++) + HANDLE_RECURSION: + + previous = code; ++ item_hwm_offset = cd->hwm - cd->start_workspace; + called = cd->start_code; + + /* When we are actually compiling, find the bracket that is being +@@ -7561,7 +7679,11 @@ for (;; 
ptr++) + previous = NULL; + cd->iscondassert = FALSE; + } +- else previous = code; ++ else ++ { ++ previous = code; ++ item_hwm_offset = cd->hwm - cd->start_workspace; ++ } + + *code = bravalue; + tempcode = code; +@@ -7809,7 +7931,7 @@ for (;; ptr++) + const pcre_uchar *p; + pcre_uint32 cf; + +- save_hwm_offset = cd->hwm - cd->start_workspace; /* Normally this is set when '(' is read */ ++ item_hwm_offset = cd->hwm - cd->start_workspace; /* Normally this is set when '(' is read */ + terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)? + CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE; + +@@ -7838,7 +7960,7 @@ for (;; ptr++) + if (*p != (pcre_uchar)terminator) + { + *errorcodeptr = ERR57; +- break; ++ goto FAILED; + } + ptr++; + goto HANDLE_NUMERICAL_RECURSION; +@@ -7853,7 +7975,7 @@ for (;; ptr++) + ptr[1] != CHAR_APOSTROPHE && ptr[1] != CHAR_LEFT_CURLY_BRACKET)) + { + *errorcodeptr = ERR69; +- break; ++ goto FAILED; + } + is_recurse = FALSE; + terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)? +@@ -7877,6 +7999,7 @@ for (;; ptr++) + HANDLE_REFERENCE: + if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE; + previous = code; ++ item_hwm_offset = cd->hwm - cd->start_workspace; + *code++ = ((options & PCRE_CASELESS) != 0)? OP_REFI : OP_REF; + PUT2INC(code, 0, recno); + cd->backref_map |= (recno < 32)? (1 << recno) : 1; +@@ -7906,6 +8029,7 @@ for (;; ptr++) + if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr)) + goto FAILED; + previous = code; ++ item_hwm_offset = cd->hwm - cd->start_workspace; + *code++ = ((escape == ESC_p) != negated)? OP_PROP : OP_NOTPROP; + *code++ = ptype; + *code++ = pdata; +@@ -7946,6 +8070,7 @@ for (;; ptr++) + + { + previous = (escape > ESC_b && escape < ESC_Z)? code : NULL; ++ item_hwm_offset = cd->hwm - cd->start_workspace; + *code++ = (!utf && escape == ESC_C)? 
OP_ALLANY : escape; + } + } +@@ -7989,6 +8114,7 @@ for (;; ptr++) + + ONE_CHAR: + previous = code; ++ item_hwm_offset = cd->hwm - cd->start_workspace; + + /* For caseless UTF-8 mode when UCP support is available, check whether + this character has more than one other case. If so, generate a special +@@ -9164,6 +9290,7 @@ cd->names_found = 0; + cd->name_entry_size = 0; + cd->name_table = NULL; + cd->dupnames = FALSE; ++cd->dupgroups = FALSE; + cd->namedrefcount = 0; + cd->start_code = cworkspace; + cd->hwm = cworkspace; +@@ -9336,6 +9463,16 @@ if (cd->hwm > cd->start_workspace) + int offset, recno; + cd->hwm -= LINK_SIZE; + offset = GET(cd->hwm, 0); ++ ++ /* Check that the hwm handling hasn't gone wrong. This whole area is ++ rewritten in PCRE2 because there are some obscure cases. */ ++ ++ if (offset == 0 || codestart[offset-1] != OP_RECURSE) ++ { ++ errorcode = ERR10; ++ break; ++ } ++ + recno = GET(codestart, offset); + if (recno != prev_recno) + { +@@ -9366,7 +9503,7 @@ used in this code because at least one compiler gives a warning about loss of + "const" attribute if the cast (pcre_uchar *)codestart is used directly in the + function call. */ + +-if ((options & PCRE_NO_AUTO_POSSESS) == 0) ++if (errorcode == 0 && (options & PCRE_NO_AUTO_POSSESS) == 0) + { + pcre_uchar *temp = (pcre_uchar *)codestart; + auto_possessify(temp, utf, cd); +@@ -9380,7 +9517,7 @@ OP_RECURSE that are not fixed length get a diagnosic with a useful offset. The + exceptional ones forgo this. We scan the pattern to check that they are fixed + length, and set their lengths. 
*/ + +-if (cd->check_lookbehind) ++if (errorcode == 0 && cd->check_lookbehind) + { + pcre_uchar *cc = (pcre_uchar *)codestart; + +@@ -9593,4 +9730,3 @@ return (pcre32 *)re; + } + + /* End of pcre_compile.c */ +- +diff --git a/ext/pcre/pcrelib/pcre_exec.c b/ext/pcre/pcrelib/pcre_exec.c +index 3942076..24b23ca 100644 +--- a/ext/pcre/pcrelib/pcre_exec.c ++++ b/ext/pcre/pcrelib/pcre_exec.c +@@ -688,7 +688,7 @@ the alternative names that are used. */ + #define foc number + #define save_mark data + +-/* These statements are here to stop the compiler complaining about uninitialized ++/* These statements are here to stop the compiler complaining about unitialized + variables. */ + + #ifdef SUPPORT_UCP +@@ -6685,7 +6685,8 @@ if (md->offset_vector != NULL) + register int *iend = iptr - re->top_bracket; + if (iend < md->offset_vector + 2) iend = md->offset_vector + 2; + while (--iptr >= iend) *iptr = -1; +- md->offset_vector[0] = md->offset_vector[1] = -1; ++ if (offsetcount > 0) md->offset_vector[0] = -1; ++ if (offsetcount > 1) md->offset_vector[1] = -1; + } + + /* Set up the first character to match, if available. The first_char value is +diff --git a/ext/pcre/pcrelib/pcre_internal.h b/ext/pcre/pcrelib/pcre_internal.h +index 4c4817d..aec1879 100644 +--- a/ext/pcre/pcrelib/pcre_internal.h ++++ b/ext/pcre/pcrelib/pcre_internal.h +@@ -988,7 +988,7 @@ other. NOTE: The values also appear in pcre_jit_compile.c. */ + #ifndef EBCDIC + + #define HSPACE_LIST \ +- CHAR_HT, CHAR_SPACE, 0xa0, \ ++ CHAR_HT, CHAR_SPACE, CHAR_NBSP, \ + 0x1680, 0x180e, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, \ + 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x202f, 0x205f, 0x3000, \ + NOTACHAR +@@ -1014,7 +1014,7 @@ other. NOTE: The values also appear in pcre_jit_compile.c. */ + #define HSPACE_BYTE_CASES \ + case CHAR_HT: \ + case CHAR_SPACE: \ +- case 0xa0 /* NBSP */ ++ case CHAR_NBSP + + #define HSPACE_CASES \ + HSPACE_BYTE_CASES: \ +@@ -1041,11 +1041,12 @@ other. 
NOTE: The values also appear in pcre_jit_compile.c. */ + /* ------ EBCDIC environments ------ */ + + #else +-#define HSPACE_LIST CHAR_HT, CHAR_SPACE ++#define HSPACE_LIST CHAR_HT, CHAR_SPACE, CHAR_NBSP, NOTACHAR + + #define HSPACE_BYTE_CASES \ + case CHAR_HT: \ +- case CHAR_SPACE ++ case CHAR_SPACE: \ ++ case CHAR_NBSP + + #define HSPACE_CASES HSPACE_BYTE_CASES + +@@ -1219,6 +1220,7 @@ same code point. */ + + #define CHAR_ESC '\047' + #define CHAR_DEL '\007' ++#define CHAR_NBSP '\x41' + #define STR_ESC "\047" + #define STR_DEL "\007" + +@@ -1233,6 +1235,7 @@ a positive value. */ + #define CHAR_NEL ((unsigned char)'\x85') + #define CHAR_ESC '\033' + #define CHAR_DEL '\177' ++#define CHAR_NBSP ((unsigned char)'\xa0') + + #define STR_LF "\n" + #define STR_NL STR_LF +@@ -1610,6 +1613,7 @@ only. */ + #define CHAR_VERTICAL_LINE '\174' + #define CHAR_RIGHT_CURLY_BRACKET '\175' + #define CHAR_TILDE '\176' ++#define CHAR_NBSP ((unsigned char)'\xa0') + + #define STR_HT "\011" + #define STR_VT "\013" +@@ -1766,6 +1770,10 @@ only. */ + + /* Escape items that are just an encoding of a particular data value. 
*/ + ++#ifndef ESC_a ++#define ESC_a CHAR_BEL ++#endif ++ + #ifndef ESC_e + #define ESC_e CHAR_ESC + #endif +@@ -2450,6 +2458,7 @@ typedef struct compile_data { + BOOL had_pruneorskip; /* (*PRUNE) or (*SKIP) encountered */ + BOOL check_lookbehind; /* Lookbehinds need later checking */ + BOOL dupnames; /* Duplicate names exist */ ++ BOOL dupgroups; /* Duplicate groups exist: (?| found */ + BOOL iscondassert; /* Next assert is a condition */ + int nltype; /* Newline type */ + int nllen; /* Newline string length */ +diff --git a/ext/pcre/pcrelib/pcre_jit_compile.c b/ext/pcre/pcrelib/pcre_jit_compile.c +index debdf6e..445de0c 100644 +--- a/ext/pcre/pcrelib/pcre_jit_compile.c ++++ b/ext/pcre/pcrelib/pcre_jit_compile.c +@@ -1064,6 +1064,7 @@ pcre_uchar *alternative; + pcre_uchar *end = NULL; + int private_data_ptr = *private_data_start; + int space, size, bracketlen; ++BOOL repeat_check = TRUE; + + while (cc < ccend) + { +@@ -1071,9 +1072,10 @@ while (cc < ccend) + size = 0; + bracketlen = 0; + if (private_data_ptr > SLJIT_MAX_LOCAL_SIZE) +- return; ++ break; + +- if (*cc == OP_ONCE || *cc == OP_ONCE_NC || *cc == OP_BRA || *cc == OP_CBRA || *cc == OP_COND) ++ if (repeat_check && (*cc == OP_ONCE || *cc == OP_ONCE_NC || *cc == OP_BRA || *cc == OP_CBRA || *cc == OP_COND)) ++ { + if (detect_repeat(common, cc)) + { + /* These brackets are converted to repeats, so no global +@@ -1081,6 +1083,8 @@ while (cc < ccend) + if (cc >= end) + end = bracketend(cc); + } ++ } ++ repeat_check = TRUE; + + switch(*cc) + { +@@ -1136,6 +1140,13 @@ while (cc < ccend) + bracketlen = 1 + LINK_SIZE + IMM2_SIZE; + break; + ++ case OP_BRAZERO: ++ case OP_BRAMINZERO: ++ case OP_BRAPOSZERO: ++ repeat_check = FALSE; ++ size = 1; ++ break; ++ + CASE_ITERATOR_PRIVATE_DATA_1 + space = 1; + size = -2; +@@ -1162,12 +1173,17 @@ while (cc < ccend) + size = 1; + break; + +- CASE_ITERATOR_TYPE_PRIVATE_DATA_2B ++ case OP_TYPEUPTO: + if (cc[1 + IMM2_SIZE] != OP_ANYNL && cc[1 + IMM2_SIZE] != OP_EXTUNI) + space = 
2; + size = 1 + IMM2_SIZE; + break; + ++ case OP_TYPEMINUPTO: ++ space = 2; ++ size = 1 + IMM2_SIZE; ++ break; ++ + case OP_CLASS: + case OP_NCLASS: + size += 1 + 32 / sizeof(pcre_uchar); +@@ -1316,6 +1332,13 @@ while (cc < ccend) + cc += 1 + LINK_SIZE + IMM2_SIZE; + break; + ++ case OP_THEN: ++ stack_restore = TRUE; ++ if (common->control_head_ptr != 0) ++ *needs_control_head = TRUE; ++ cc ++; ++ break; ++ + default: + stack_restore = TRUE; + /* Fall through. */ +@@ -2220,6 +2243,7 @@ while (current != NULL) + SLJIT_ASSERT_STOP(); + break; + } ++ SLJIT_ASSERT(current > (sljit_sw*)current[-1]); + current = (sljit_sw*)current[-1]; + } + return -1; +@@ -3209,7 +3233,7 @@ bytes[len] = byte; + bytes[0] = len; + } + +-static int scan_prefix(compiler_common *common, pcre_uchar *cc, pcre_uint32 *chars, pcre_uint8 *bytes, int max_chars) ++static int scan_prefix(compiler_common *common, pcre_uchar *cc, pcre_uint32 *chars, pcre_uint8 *bytes, int max_chars, pcre_uint32 *rec_count) + { + /* Recursive function, which scans prefix literals. 
*/ + BOOL last, any, caseless; +@@ -3227,9 +3251,14 @@ pcre_uchar othercase[1]; + repeat = 1; + while (TRUE) + { ++ if (*rec_count == 0) ++ return 0; ++ (*rec_count)--; ++ + last = TRUE; + any = FALSE; + caseless = FALSE; ++ + switch (*cc) + { + case OP_CHARI: +@@ -3291,7 +3320,7 @@ while (TRUE) + #ifdef SUPPORT_UTF + if (common->utf && HAS_EXTRALEN(*cc)) len += GET_EXTRALEN(*cc); + #endif +- max_chars = scan_prefix(common, cc + len, chars, bytes, max_chars); ++ max_chars = scan_prefix(common, cc + len, chars, bytes, max_chars, rec_count); + if (max_chars == 0) + return consumed; + last = FALSE; +@@ -3314,7 +3343,7 @@ while (TRUE) + alternative = cc + GET(cc, 1); + while (*alternative == OP_ALT) + { +- max_chars = scan_prefix(common, alternative + 1 + LINK_SIZE, chars, bytes, max_chars); ++ max_chars = scan_prefix(common, alternative + 1 + LINK_SIZE, chars, bytes, max_chars, rec_count); + if (max_chars == 0) + return consumed; + alternative += GET(alternative, 1); +@@ -3556,6 +3585,7 @@ int i, max, from; + int range_right = -1, range_len = 3 - 1; + sljit_ub *update_table = NULL; + BOOL in_range; ++pcre_uint32 rec_count; + + for (i = 0; i < MAX_N_CHARS; i++) + { +@@ -3564,7 +3594,8 @@ for (i = 0; i < MAX_N_CHARS; i++) + bytes[i * MAX_N_BYTES] = 0; + } + +-max = scan_prefix(common, common->start, chars, bytes, MAX_N_CHARS); ++rec_count = 10000; ++max = scan_prefix(common, common->start, chars, bytes, MAX_N_CHARS, &rec_count); + + if (max <= 1) + return FALSE; +@@ -4311,8 +4342,10 @@ switch(length) + case 4: + if ((ranges[1] - ranges[0]) == (ranges[3] - ranges[2]) + && (ranges[0] | (ranges[2] - ranges[0])) == ranges[2] ++ && (ranges[1] & (ranges[2] - ranges[0])) == 0 + && is_powerof2(ranges[2] - ranges[0])) + { ++ SLJIT_ASSERT((ranges[0] & (ranges[2] - ranges[0])) == 0 && (ranges[2] & ranges[3] & (ranges[2] - ranges[0])) != 0); + OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, ranges[2] - ranges[0]); + if (ranges[2] + 1 != ranges[3]) + { +@@ -4900,9 +4933,10 @@ else if 
((cc[-1] & XCL_MAP) != 0) + if (!check_class_ranges(common, (const pcre_uint8 *)cc, FALSE, TRUE, list)) + { + #ifdef COMPILE_PCRE8 +- SLJIT_ASSERT(common->utf); ++ jump = NULL; ++ if (common->utf) + #endif +- jump = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, 255); ++ jump = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, 255); + + OP2(SLJIT_AND, TMP2, 0, TMP1, 0, SLJIT_IMM, 0x7); + OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 3); +@@ -4911,7 +4945,10 @@ else if ((cc[-1] & XCL_MAP) != 0) + OP2(SLJIT_AND | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, TMP2, 0); + add_jump(compiler, list, JUMP(SLJIT_NOT_ZERO)); + +- JUMPHERE(jump); ++#ifdef COMPILE_PCRE8 ++ if (common->utf) ++#endif ++ JUMPHERE(jump); + } + + OP1(SLJIT_MOV, TMP1, 0, TMP3, 0); +@@ -5219,7 +5256,7 @@ while (*cc != XCL_END) + OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_UNUSED, 0, SLJIT_LESS_EQUAL); + + SET_CHAR_OFFSET(0); +- OP2(SLJIT_SUB | SLJIT_SET_U, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0xff); ++ OP2(SLJIT_SUB | SLJIT_SET_U, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x7f); + OP_FLAGS(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_LESS_EQUAL); + + SET_TYPE_OFFSET(ucp_Pc); +@@ -7665,6 +7702,10 @@ while (*cc != OP_KETRPOS) + OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), STR_PTR, 0); + } + ++ /* Even if the match is empty, we need to reset the control head. */ ++ if (needs_control_head) ++ OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->control_head_ptr, SLJIT_MEM1(STACK_TOP), STACK(stack)); ++ + if (opcode == OP_SBRAPOS || opcode == OP_SCBRAPOS) + add_jump(compiler, &emptymatch, CMP(SLJIT_EQUAL, TMP1, 0, STR_PTR, 0)); + +@@ -7692,6 +7733,10 @@ while (*cc != OP_KETRPOS) + OP1(SLJIT_MOV, SLJIT_MEM1(TMP2), (framesize + 1) * sizeof(sljit_sw), STR_PTR, 0); + } + ++ /* Even if the match is empty, we need to reset the control head. 
*/ ++ if (needs_control_head) ++ OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->control_head_ptr, SLJIT_MEM1(STACK_TOP), STACK(stack)); ++ + if (opcode == OP_SBRAPOS || opcode == OP_SCBRAPOS) + add_jump(compiler, &emptymatch, CMP(SLJIT_EQUAL, TMP1, 0, STR_PTR, 0)); + +@@ -7704,9 +7749,6 @@ while (*cc != OP_KETRPOS) + } + } + +- if (needs_control_head) +- OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->control_head_ptr, SLJIT_MEM1(STACK_TOP), STACK(stack)); +- + JUMPTO(SLJIT_JUMP, loop); + flush_stubs(common); + +@@ -8441,8 +8483,7 @@ while (cc < ccend) + OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(1), STR_PTR, 0); + } + BACKTRACK_AS(braminzero_backtrack)->matchingpath = LABEL(); +- if (cc[1] > OP_ASSERTBACK_NOT) +- count_match(common); ++ count_match(common); + break; + + case OP_ONCE: +@@ -9624,7 +9665,7 @@ static SLJIT_INLINE void compile_recurse(compiler_common *common) + DEFINE_COMPILER; + pcre_uchar *cc = common->start + common->currententry->start; + pcre_uchar *ccbegin = cc + 1 + LINK_SIZE + (*cc == OP_BRA ? 
0 : IMM2_SIZE); +-pcre_uchar *ccend = bracketend(cc); ++pcre_uchar *ccend = bracketend(cc) - (1 + LINK_SIZE); + BOOL needs_control_head; + int framesize = get_framesize(common, cc, NULL, TRUE, &needs_control_head); + int private_data_size = get_private_data_copy_length(common, ccbegin, ccend, needs_control_head); +@@ -9648,6 +9689,7 @@ set_jumps(common->currententry->calls, common->currententry->entry); + + sljit_emit_fast_enter(compiler, TMP2, 0); + allocate_stack(common, private_data_size + framesize + alternativesize); ++count_match(common); + OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(private_data_size + framesize + alternativesize - 1), TMP2, 0); + copy_private_data(common, ccbegin, ccend, TRUE, private_data_size + framesize + alternativesize, framesize + alternativesize, needs_control_head); + if (needs_control_head) +@@ -9992,6 +10034,7 @@ OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, stack)); + OP1(SLJIT_MOV_UI, TMP1, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, limit_match)); + OP1(SLJIT_MOV, STACK_TOP, 0, SLJIT_MEM1(TMP2), SLJIT_OFFSETOF(struct sljit_stack, base)); + OP1(SLJIT_MOV, STACK_LIMIT, 0, SLJIT_MEM1(TMP2), SLJIT_OFFSETOF(struct sljit_stack, limit)); ++OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 1); + OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LIMIT_MATCH, TMP1, 0); + + if (mode == JIT_PARTIAL_SOFT_COMPILE) +diff --git a/ext/pcre/pcrelib/pcre_study.c b/ext/pcre/pcrelib/pcre_study.c +index 998fe23..7fd0ba0 100644 +--- a/ext/pcre/pcrelib/pcre_study.c ++++ b/ext/pcre/pcrelib/pcre_study.c +@@ -71,6 +71,7 @@ rather than bytes. 
+ startcode pointer to start of the whole pattern's code + options the compiling options + recurses chain of recurse_check to catch mutual recursion ++ countptr pointer to call count (to catch over complexity) + + Returns: the minimum length + -1 if \C in UTF-8 mode or (*ACCEPT) was encountered +@@ -80,7 +81,8 @@ Returns: the minimum length + + static int + find_minlength(const REAL_PCRE *re, const pcre_uchar *code, +- const pcre_uchar *startcode, int options, recurse_check *recurses) ++ const pcre_uchar *startcode, int options, recurse_check *recurses, ++ int *countptr) + { + int length = -1; + /* PCRE_UTF16 has the same value as PCRE_UTF8. */ +@@ -90,6 +92,8 @@ recurse_check this_recurse; + register int branchlength = 0; + register pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE; + ++if ((*countptr)++ > 1000) return -1; /* too complex */ ++ + if (*code == OP_CBRA || *code == OP_SCBRA || + *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE; + +@@ -131,7 +135,7 @@ for (;;) + case OP_SBRAPOS: + case OP_ONCE: + case OP_ONCE_NC: +- d = find_minlength(re, cc, startcode, options, recurses); ++ d = find_minlength(re, cc, startcode, options, recurses, countptr); + if (d < 0) return d; + branchlength += d; + do cc += GET(cc, 1); while (*cc == OP_ALT); +@@ -415,7 +419,8 @@ for (;;) + int dd; + this_recurse.prev = recurses; + this_recurse.group = cs; +- dd = find_minlength(re, cs, startcode, options, &this_recurse); ++ dd = find_minlength(re, cs, startcode, options, &this_recurse, ++ countptr); + if (dd < d) d = dd; + } + } +@@ -451,7 +456,8 @@ for (;;) + { + this_recurse.prev = recurses; + this_recurse.group = cs; +- d = find_minlength(re, cs, startcode, options, &this_recurse); ++ d = find_minlength(re, cs, startcode, options, &this_recurse, ++ countptr); + } + } + } +@@ -514,7 +520,7 @@ for (;;) + this_recurse.prev = recurses; + this_recurse.group = cs; + branchlength += find_minlength(re, cs, startcode, options, +- &this_recurse); ++ &this_recurse, 
countptr); + } + } + cc += 1 + LINK_SIZE; +@@ -1453,6 +1459,7 @@ pcre32_study(const pcre32 *external_re, int options, const char **errorptr) + #endif + { + int min; ++int count = 0; + BOOL bits_set = FALSE; + pcre_uint8 start_bits[32]; + PUBL(extra) *extra = NULL; +@@ -1539,7 +1546,7 @@ if ((re->options & PCRE_ANCHORED) == 0 && + + /* Find the minimum length of subject string. */ + +-switch(min = find_minlength(re, code, code, re->options, NULL)) ++switch(min = find_minlength(re, code, code, re->options, NULL, &count)) + { + case -2: *errorptr = "internal error: missing capturing bracket"; return NULL; + case -3: *errorptr = "internal error: opcode not recognized"; return NULL; +diff --git a/ext/pcre/pcrelib/pcre_xclass.c b/ext/pcre/pcrelib/pcre_xclass.c +index c2b61f0..ef759a5 100644 +--- a/ext/pcre/pcrelib/pcre_xclass.c ++++ b/ext/pcre/pcrelib/pcre_xclass.c +@@ -246,7 +246,7 @@ while ((t = *data++) != XCL_END) + + case PT_PXPUNCT: + if ((PRIV(ucp_gentype)[prop->chartype] == ucp_P || +- (c < 256 && PRIV(ucp_gentype)[prop->chartype] == ucp_S)) == isprop) ++ (c < 128 && PRIV(ucp_gentype)[prop->chartype] == ucp_S)) == isprop) + return !negated; + break; + +diff --git a/ext/pcre/pcrelib/sljit/sljitConfig.h b/ext/pcre/pcrelib/sljit/sljitConfig.h +index 10364c3..1c8a521 100644 +--- a/ext/pcre/pcrelib/sljit/sljitConfig.h ++++ b/ext/pcre/pcrelib/sljit/sljitConfig.h +@@ -96,6 +96,15 @@ + #define SLJIT_EXECUTABLE_ALLOCATOR 1 + #endif + ++/* Force cdecl calling convention even if a better calling ++ convention (e.g. fastcall) is supported by the C compiler. ++ If this option is enabled, C functions without ++ SLJIT_CALL can also be called from JIT code. */ ++#ifndef SLJIT_USE_CDECL_CALLING_CONVENTION ++/* Disabled by default */ ++#define SLJIT_USE_CDECL_CALLING_CONVENTION 0 ++#endif ++ + /* Return with error when an invalid argument is passed. 
*/ + #ifndef SLJIT_ARGUMENT_CHECKS + /* Disabled by default */ +diff --git a/ext/pcre/pcrelib/sljit/sljitConfigInternal.h b/ext/pcre/pcrelib/sljit/sljitConfigInternal.h +index 3284012..16e3547 100644 +--- a/ext/pcre/pcrelib/sljit/sljitConfigInternal.h ++++ b/ext/pcre/pcrelib/sljit/sljitConfigInternal.h +@@ -468,7 +468,12 @@ typedef double sljit_d; + + #ifndef SLJIT_CALL + +-#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) ++#if (defined SLJIT_USE_CDECL_CALLING_CONVENTION && SLJIT_USE_CDECL_CALLING_CONVENTION) ++ ++/* Force cdecl. */ ++#define SLJIT_CALL ++ ++#elif (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) + + #if defined(__GNUC__) && !defined(__APPLE__) + +@@ -608,6 +613,12 @@ SLJIT_API_FUNC_ATTRIBUTE void sljit_free_unused_memory_exec(void); + #define SLJIT_LOCALS_OFFSET_BASE ((23 + 1) * sizeof(sljit_sw)) + #endif + ++#elif (defined SLJIT_CONFIG_TILEGX && SLJIT_CONFIG_TILEGX) ++ ++#define SLJIT_NUMBER_OF_REGISTERS 10 ++#define SLJIT_NUMBER_OF_SAVED_REGISTERS 5 ++#define SLJIT_LOCALS_OFFSET_BASE 0 ++ + #elif (defined SLJIT_CONFIG_UNSUPPORTED && SLJIT_CONFIG_UNSUPPORTED) + + #define SLJIT_NUMBER_OF_REGISTERS 0 +diff --git a/ext/pcre/pcrelib/sljit/sljitLir.c b/ext/pcre/pcrelib/sljit/sljitLir.c +index 5039a7e..0f1b1c9 100644 +--- a/ext/pcre/pcrelib/sljit/sljitLir.c ++++ b/ext/pcre/pcrelib/sljit/sljitLir.c +@@ -845,8 +845,8 @@ SLJIT_API_FUNC_ATTRIBUTE void sljit_compiler_verbose(struct sljit_compiler *comp + } + + static SLJIT_CONST char* op0_names[] = { +- (char*)"breakpoint", (char*)"nop", +- (char*)"lumul", (char*)"lsmul", (char*)"ludiv", (char*)"lsdiv", ++ (char*)"breakpoint", (char*)"nop", (char*)"lumul", (char*)"lsmul", ++ (char*)"udivmod", (char*)"sdivmod", (char*)"udivi", (char*)"sdivi" + }; + + static SLJIT_CONST char* op1_names[] = { +@@ -1036,7 +1036,7 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_op0(struct sljit_compiler + { + #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS) + CHECK_ARGUMENT((op >= 
SLJIT_BREAKPOINT && op <= SLJIT_LSMUL) +- || ((op & ~SLJIT_INT_OP) >= SLJIT_LUDIV && (op & ~SLJIT_INT_OP) <= SLJIT_LSDIV)); ++ || ((op & ~SLJIT_INT_OP) >= SLJIT_UDIVMOD && (op & ~SLJIT_INT_OP) <= SLJIT_SDIVI)); + CHECK_ARGUMENT(op < SLJIT_LUMUL || compiler->scratches >= 2); + #endif + #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) +@@ -1447,6 +1447,8 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_op_flags(struct sljit_com + + static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_get_local_base(struct sljit_compiler *compiler, sljit_si dst, sljit_sw dstw, sljit_sw offset) + { ++ SLJIT_UNUSED_ARG(offset); ++ + #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS) + FUNCTION_CHECK_DST(dst, dstw); + #endif +@@ -1462,6 +1464,8 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_get_local_base(struct sljit_co + + static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_const(struct sljit_compiler *compiler, sljit_si dst, sljit_sw dstw, sljit_sw init_value) + { ++ SLJIT_UNUSED_ARG(init_value); ++ + #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS) + FUNCTION_CHECK_DST(dst, dstw); + #endif +diff --git a/ext/pcre/pcrelib/sljit/sljitLir.h b/ext/pcre/pcrelib/sljit/sljitLir.h +index 24c0f60..2e2e9ac09 100644 +--- a/ext/pcre/pcrelib/sljit/sljitLir.h ++++ b/ext/pcre/pcrelib/sljit/sljitLir.h +@@ -687,7 +687,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_fast_return(struct sljit_compiler * + #define SLJIT_OP0_BASE 0 + + /* Flags: - (never set any flags) +- Note: breakpoint instruction is not supported by all architectures (namely ppc) ++ Note: breakpoint instruction is not supported by all architectures (e.g. ppc) + It falls back to SLJIT_NOP in those cases. 
*/ + #define SLJIT_BREAKPOINT (SLJIT_OP0_BASE + 0) + /* Flags: - (never set any flags) +@@ -696,24 +696,42 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_fast_return(struct sljit_compiler * + #define SLJIT_NOP (SLJIT_OP0_BASE + 1) + /* Flags: - (may destroy flags) + Unsigned multiplication of SLJIT_R0 and SLJIT_R1. +- Result goes to SLJIT_R1:SLJIT_R0 (high:low) word */ ++ Result is placed into SLJIT_R1:SLJIT_R0 (high:low) word */ + #define SLJIT_LUMUL (SLJIT_OP0_BASE + 2) + /* Flags: - (may destroy flags) + Signed multiplication of SLJIT_R0 and SLJIT_R1. +- Result goes to SLJIT_R1:SLJIT_R0 (high:low) word */ ++ Result is placed into SLJIT_R1:SLJIT_R0 (high:low) word */ + #define SLJIT_LSMUL (SLJIT_OP0_BASE + 3) + /* Flags: I - (may destroy flags) + Unsigned divide of the value in SLJIT_R0 by the value in SLJIT_R1. +- The result is placed in SLJIT_R0 and the remainder goes to SLJIT_R1. +- Note: if SLJIT_R1 contains 0, the behaviour is undefined. */ +-#define SLJIT_LUDIV (SLJIT_OP0_BASE + 4) +-#define SLJIT_ILUDIV (SLJIT_LUDIV | SLJIT_INT_OP) ++ The result is placed into SLJIT_R0 and the remainder into SLJIT_R1. ++ Note: if SLJIT_R1 is 0, the behaviour is undefined. */ ++#define SLJIT_UDIVMOD (SLJIT_OP0_BASE + 4) ++#define SLJIT_IUDIVMOD (SLJIT_UDIVMOD | SLJIT_INT_OP) + /* Flags: I - (may destroy flags) + Signed divide of the value in SLJIT_R0 by the value in SLJIT_R1. +- The result is placed in SLJIT_R0 and the remainder goes to SLJIT_R1. +- Note: if SLJIT_R1 contains 0, the behaviour is undefined. */ +-#define SLJIT_LSDIV (SLJIT_OP0_BASE + 5) +-#define SLJIT_ILSDIV (SLJIT_LSDIV | SLJIT_INT_OP) ++ The result is placed into SLJIT_R0 and the remainder into SLJIT_R1. ++ Note: if SLJIT_R1 is 0, the behaviour is undefined. ++ Note: if SLJIT_R1 is -1 and SLJIT_R0 is integer min (0x800..00), ++ the behaviour is undefined. 
*/ ++#define SLJIT_SDIVMOD (SLJIT_OP0_BASE + 5) ++#define SLJIT_ISDIVMOD (SLJIT_SDIVMOD | SLJIT_INT_OP) ++/* Flags: I - (may destroy flags) ++ Unsigned divide of the value in SLJIT_R0 by the value in SLJIT_R1. ++ The result is placed into SLJIT_R0. SLJIT_R1 preserves its value. ++ Note: if SLJIT_R1 is 0, the behaviour is undefined. ++ Note: SLJIT_SDIV is single precision divide. */ ++#define SLJIT_UDIVI (SLJIT_OP0_BASE + 6) ++#define SLJIT_IUDIVI (SLJIT_UDIVI | SLJIT_INT_OP) ++/* Flags: I - (may destroy flags) ++ Signed divide of the value in SLJIT_R0 by the value in SLJIT_R1. ++ The result is placed into SLJIT_R0. SLJIT_R1 preserves its value. ++ Note: if SLJIT_R1 is 0, the behaviour is undefined. ++ Note: if SLJIT_R1 is -1 and SLJIT_R0 is integer min (0x800..00), ++ the behaviour is undefined. ++ Note: SLJIT_SDIV is single precision divide. */ ++#define SLJIT_SDIVI (SLJIT_OP0_BASE + 7) ++#define SLJIT_ISDIVI (SLJIT_SDIVI | SLJIT_INT_OP) + + SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler, sljit_si op); + +@@ -851,34 +869,6 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op2(struct sljit_compiler *compiler + sljit_si src1, sljit_sw src1w, + sljit_si src2, sljit_sw src2w); + +-/* The following function is a helper function for sljit_emit_op_custom. +- It returns with the real machine register index ( >=0 ) of any SLJIT_R, +- SLJIT_S and SLJIT_SP registers. +- +- Note: it returns with -1 for virtual registers (only on x86-32). */ +- +-SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_get_register_index(sljit_si reg); +- +-/* The following function is a helper function for sljit_emit_op_custom. +- It returns with the real machine register index of any SLJIT_FLOAT register. +- +- Note: the index is always an even number on ARM (except ARM-64), MIPS, and SPARC. */ +- +-SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_get_float_register_index(sljit_si reg); +- +-/* Any instruction can be inserted into the instruction stream by +- sljit_emit_op_custom. 
It has a similar purpose as inline assembly. +- The size parameter must match to the instruction size of the target +- architecture: +- +- x86: 0 < size <= 15. The instruction argument can be byte aligned. +- Thumb2: if size == 2, the instruction argument must be 2 byte aligned. +- if size == 4, the instruction argument must be 4 byte aligned. +- Otherwise: size must be 4 and instruction argument must be 4 byte aligned. */ +- +-SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op_custom(struct sljit_compiler *compiler, +- void *instruction, sljit_si size); +- + /* Returns with non-zero if fpu is available. */ + + SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_is_fpu_available(void); +@@ -1196,4 +1186,64 @@ SLJIT_API_FUNC_ATTRIBUTE void sljit_set_function_context(void** func_ptr, struct + + #endif /* !(defined SLJIT_INDIRECT_CALL && SLJIT_INDIRECT_CALL) */ + ++/* --------------------------------------------------------------------- */ ++/* CPU specific functions */ ++/* --------------------------------------------------------------------- */ ++ ++/* The following function is a helper function for sljit_emit_op_custom. ++ It returns with the real machine register index ( >=0 ) of any SLJIT_R, ++ SLJIT_S and SLJIT_SP registers. ++ ++ Note: it returns with -1 for virtual registers (only on x86-32). */ ++ ++SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_get_register_index(sljit_si reg); ++ ++/* The following function is a helper function for sljit_emit_op_custom. ++ It returns with the real machine register index of any SLJIT_FLOAT register. ++ ++ Note: the index is always an even number on ARM (except ARM-64), MIPS, and SPARC. */ ++ ++SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_get_float_register_index(sljit_si reg); ++ ++/* Any instruction can be inserted into the instruction stream by ++ sljit_emit_op_custom. It has a similar purpose as inline assembly. ++ The size parameter must match to the instruction size of the target ++ architecture: ++ ++ x86: 0 < size <= 15. 
The instruction argument can be byte aligned. ++ Thumb2: if size == 2, the instruction argument must be 2 byte aligned. ++ if size == 4, the instruction argument must be 4 byte aligned. ++ Otherwise: size must be 4 and instruction argument must be 4 byte aligned. */ ++ ++SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op_custom(struct sljit_compiler *compiler, ++ void *instruction, sljit_si size); ++ ++#if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86) ++ ++/* Returns with non-zero if sse2 is available. */ ++ ++SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_x86_is_sse2_available(void); ++ ++/* Returns with non-zero if cmov instruction is available. */ ++ ++SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_x86_is_cmov_available(void); ++ ++/* Emit a conditional mov instruction on x86 CPUs. This instruction ++ moves src to destination, if the condition is satisfied. Unlike ++ other arithmetic instructions, destination must be a register. ++ Before such instructions are emitted, cmov support should be ++ checked by sljit_x86_is_cmov_available function. 
++ type must be between SLJIT_EQUAL and SLJIT_S_ORDERED ++ dst_reg must be a valid register and it can be combined ++ with SLJIT_INT_OP to perform 32 bit arithmetic ++ Flags: I - (never set any flags) ++ */ ++ ++SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_x86_emit_cmov(struct sljit_compiler *compiler, ++ sljit_si type, ++ sljit_si dst_reg, ++ sljit_si src, sljit_sw srcw); ++ ++#endif ++ + #endif /* _SLJIT_LIR_H_ */ +diff --git a/ext/pcre/pcrelib/sljit/sljitNativeARM_32.c b/ext/pcre/pcrelib/sljit/sljitNativeARM_32.c +index aca1d31..5cd4c71 100644 +--- a/ext/pcre/pcrelib/sljit/sljitNativeARM_32.c ++++ b/ext/pcre/pcrelib/sljit/sljitNativeARM_32.c +@@ -1833,18 +1833,33 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler + | (reg_map[SLJIT_R0] << 8) + | reg_map[TMP_REG1]); + #endif +- case SLJIT_LUDIV: +- case SLJIT_LSDIV: +- if (compiler->scratches >= 3) ++ case SLJIT_UDIVMOD: ++ case SLJIT_SDIVMOD: ++ case SLJIT_UDIVI: ++ case SLJIT_SDIVI: ++ SLJIT_COMPILE_ASSERT((SLJIT_UDIVMOD & 0x2) == 0 && SLJIT_UDIVI - 0x2 == SLJIT_UDIVMOD, bad_div_opcode_assignments); ++ SLJIT_COMPILE_ASSERT(reg_map[2] == 1 && reg_map[3] == 2, bad_register_mapping); ++ ++ if ((op >= SLJIT_UDIVI) && (compiler->scratches >= 3)) { + FAIL_IF(push_inst(compiler, 0xe52d2008 /* str r2, [sp, #-8]! */)); ++ FAIL_IF(push_inst(compiler, 0xe58d1004 /* str r1, [sp, #4] */)); ++ } ++ else if ((op >= SLJIT_UDIVI) || (compiler->scratches >= 3)) ++ FAIL_IF(push_inst(compiler, 0xe52d0008 | (op >= SLJIT_UDIVI ? 0x1000 : 0x2000) /* str r1/r2, [sp, #-8]! */)); ++ + #if defined(__GNUC__) + FAIL_IF(sljit_emit_ijump(compiler, SLJIT_FAST_CALL, SLJIT_IMM, +- (op == SLJIT_LUDIV ? SLJIT_FUNC_OFFSET(__aeabi_uidivmod) : SLJIT_FUNC_OFFSET(__aeabi_idivmod)))); ++ ((op | 0x2) == SLJIT_UDIVI ? 
SLJIT_FUNC_OFFSET(__aeabi_uidivmod) : SLJIT_FUNC_OFFSET(__aeabi_idivmod)))); + #else + #error "Software divmod functions are needed" + #endif +- if (compiler->scratches >= 3) +- return push_inst(compiler, 0xe49d2008 /* ldr r2, [sp], #8 */); ++ ++ if ((op >= SLJIT_UDIVI) && (compiler->scratches >= 3)) { ++ FAIL_IF(push_inst(compiler, 0xe59d1004 /* ldr r1, [sp, #4] */)); ++ FAIL_IF(push_inst(compiler, 0xe49d2008 /* ldr r2, [sp], #8 */)); ++ } ++ else if ((op >= SLJIT_UDIVI) || (compiler->scratches >= 3)) ++ return push_inst(compiler, 0xe49d0008 | (op >= SLJIT_UDIVI ? 0x1000 : 0x2000) /* ldr r1/r2, [sp], #8 */); + return SLJIT_SUCCESS; + } + +diff --git a/ext/pcre/pcrelib/sljit/sljitNativeARM_64.c b/ext/pcre/pcrelib/sljit/sljitNativeARM_64.c +index b66455f..044a675 100644 +--- a/ext/pcre/pcrelib/sljit/sljitNativeARM_64.c ++++ b/ext/pcre/pcrelib/sljit/sljitNativeARM_64.c +@@ -1087,14 +1087,20 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_enter(struct sljit_compiler *compil + saved_regs_size += sizeof(sljit_sw); + } + local_size -= saved_regs_size + SLJIT_LOCALS_OFFSET; +- FAIL_IF(push_inst(compiler, SUBI | RD(TMP_SP) | RN(TMP_SP) | (saved_regs_size << 10))); ++ if (saved_regs_size > 0) ++ FAIL_IF(push_inst(compiler, SUBI | RD(TMP_SP) | RN(TMP_SP) | (saved_regs_size << 10))); + } + + tmp = saveds < SLJIT_NUMBER_OF_SAVED_REGISTERS ? 
(SLJIT_S0 + 1 - saveds) : SLJIT_FIRST_SAVED_REG; + prev = -1; + for (i = SLJIT_S0; i >= tmp; i--) { + if (prev == -1) { +- prev = i; ++ if (!(offs & (1 << 15))) { ++ prev = i; ++ continue; ++ } ++ FAIL_IF(push_inst(compiler, STRI | RT(i) | RN(TMP_SP) | (offs >> 5))); ++ offs += 1 << 15; + continue; + } + FAIL_IF(push_inst(compiler, STP | RT(prev) | RT2(i) | RN(TMP_SP) | offs)); +@@ -1104,7 +1110,12 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_enter(struct sljit_compiler *compil + + for (i = scratches; i >= SLJIT_FIRST_SAVED_REG; i--) { + if (prev == -1) { +- prev = i; ++ if (!(offs & (1 << 15))) { ++ prev = i; ++ continue; ++ } ++ FAIL_IF(push_inst(compiler, STRI | RT(i) | RN(TMP_SP) | (offs >> 5))); ++ offs += 1 << 15; + continue; + } + FAIL_IF(push_inst(compiler, STP | RT(prev) | RT2(i) | RN(TMP_SP) | offs)); +@@ -1112,8 +1123,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_enter(struct sljit_compiler *compil + prev = -1; + } + +- if (prev != -1) +- FAIL_IF(push_inst(compiler, STRI | RT(prev) | RN(TMP_SP) | (offs >> 5))); ++ SLJIT_ASSERT(prev == -1); + + if (compiler->local_size > (63 * sizeof(sljit_sw))) { + /* The local_size is already adjusted by the saved registers. 
*/ +@@ -1188,7 +1198,12 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_return(struct sljit_compiler *compi + prev = -1; + for (i = SLJIT_S0; i >= tmp; i--) { + if (prev == -1) { +- prev = i; ++ if (!(offs & (1 << 15))) { ++ prev = i; ++ continue; ++ } ++ FAIL_IF(push_inst(compiler, LDRI | RT(i) | RN(TMP_SP) | (offs >> 5))); ++ offs += 1 << 15; + continue; + } + FAIL_IF(push_inst(compiler, LDP | RT(prev) | RT2(i) | RN(TMP_SP) | offs)); +@@ -1198,7 +1213,12 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_return(struct sljit_compiler *compi + + for (i = compiler->scratches; i >= SLJIT_FIRST_SAVED_REG; i--) { + if (prev == -1) { +- prev = i; ++ if (!(offs & (1 << 15))) { ++ prev = i; ++ continue; ++ } ++ FAIL_IF(push_inst(compiler, LDRI | RT(i) | RN(TMP_SP) | (offs >> 5))); ++ offs += 1 << 15; + continue; + } + FAIL_IF(push_inst(compiler, LDP | RT(prev) | RT2(i) | RN(TMP_SP) | offs)); +@@ -1206,13 +1226,12 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_return(struct sljit_compiler *compi + prev = -1; + } + +- if (prev != -1) +- FAIL_IF(push_inst(compiler, LDRI | RT(prev) | RN(TMP_SP) | (offs >> 5))); ++ SLJIT_ASSERT(prev == -1); + + if (compiler->local_size <= (63 * sizeof(sljit_sw))) { + FAIL_IF(push_inst(compiler, LDP_PST | 29 | RT2(TMP_LR) + | RN(TMP_SP) | (((local_size >> 3) & 0x7f) << 15))); +- } else { ++ } else if (saved_regs_size > 0) { + FAIL_IF(push_inst(compiler, ADDI | RD(TMP_SP) | RN(TMP_SP) | (saved_regs_size << 10))); + } + +@@ -1242,12 +1261,15 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler + FAIL_IF(push_inst(compiler, ORR | RD(TMP_REG1) | RN(TMP_ZERO) | RM(SLJIT_R0))); + FAIL_IF(push_inst(compiler, MADD | RD(SLJIT_R0) | RN(SLJIT_R0) | RM(SLJIT_R1) | RT2(TMP_ZERO))); + return push_inst(compiler, (op == SLJIT_LUMUL ? 
UMULH : SMULH) | RD(SLJIT_R1) | RN(TMP_REG1) | RM(SLJIT_R1)); +- case SLJIT_LUDIV: +- case SLJIT_LSDIV: ++ case SLJIT_UDIVMOD: ++ case SLJIT_SDIVMOD: + FAIL_IF(push_inst(compiler, (ORR ^ inv_bits) | RD(TMP_REG1) | RN(TMP_ZERO) | RM(SLJIT_R0))); +- FAIL_IF(push_inst(compiler, ((op == SLJIT_LUDIV ? UDIV : SDIV) ^ inv_bits) | RD(SLJIT_R0) | RN(SLJIT_R0) | RM(SLJIT_R1))); ++ FAIL_IF(push_inst(compiler, ((op == SLJIT_UDIVMOD ? UDIV : SDIV) ^ inv_bits) | RD(SLJIT_R0) | RN(SLJIT_R0) | RM(SLJIT_R1))); + FAIL_IF(push_inst(compiler, (MADD ^ inv_bits) | RD(SLJIT_R1) | RN(SLJIT_R0) | RM(SLJIT_R1) | RT2(TMP_ZERO))); + return push_inst(compiler, (SUB ^ inv_bits) | RD(SLJIT_R1) | RN(TMP_REG1) | RM(SLJIT_R1)); ++ case SLJIT_UDIVI: ++ case SLJIT_SDIVI: ++ return push_inst(compiler, ((op == SLJIT_UDIVI ? UDIV : SDIV) ^ inv_bits) | RD(SLJIT_R0) | RN(SLJIT_R0) | RM(SLJIT_R1)); + } + + return SLJIT_SUCCESS; +diff --git a/ext/pcre/pcrelib/sljit/sljitNativeARM_T2_32.c b/ext/pcre/pcrelib/sljit/sljitNativeARM_T2_32.c +index 6e38cec..f9803f5 100644 +--- a/ext/pcre/pcrelib/sljit/sljitNativeARM_T2_32.c ++++ b/ext/pcre/pcrelib/sljit/sljitNativeARM_T2_32.c +@@ -1239,6 +1239,9 @@ extern int __aeabi_idivmod(int numerator, int denominator); + + SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler, sljit_si op) + { ++ sljit_sw saved_reg_list[3]; ++ sljit_sw saved_reg_count; ++ + CHECK_ERROR(); + CHECK(check_sljit_emit_op0(compiler, op)); + +@@ -1255,24 +1258,53 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler + | (reg_map[SLJIT_R0] << 12) + | (reg_map[SLJIT_R0] << 16) + | reg_map[SLJIT_R1]); +- case SLJIT_LUDIV: +- case SLJIT_LSDIV: +- if (compiler->scratches >= 4) { +- FAIL_IF(push_inst32(compiler, 0xf84d2d04 /* str r2, [sp, #-4]! */)); +- FAIL_IF(push_inst32(compiler, 0xf84dcd04 /* str ip, [sp, #-4]! */)); +- } else if (compiler->scratches >= 3) +- FAIL_IF(push_inst32(compiler, 0xf84d2d08 /* str r2, [sp, #-8]! 
*/)); ++ case SLJIT_UDIVMOD: ++ case SLJIT_SDIVMOD: ++ case SLJIT_UDIVI: ++ case SLJIT_SDIVI: ++ SLJIT_COMPILE_ASSERT((SLJIT_UDIVMOD & 0x2) == 0 && SLJIT_UDIVI - 0x2 == SLJIT_UDIVMOD, bad_div_opcode_assignments); ++ SLJIT_COMPILE_ASSERT(reg_map[2] == 1 && reg_map[3] == 2 && reg_map[4] == 12, bad_register_mapping); ++ ++ saved_reg_count = 0; ++ if (compiler->scratches >= 4) ++ saved_reg_list[saved_reg_count++] = 12; ++ if (compiler->scratches >= 3) ++ saved_reg_list[saved_reg_count++] = 2; ++ if (op >= SLJIT_UDIVI) ++ saved_reg_list[saved_reg_count++] = 1; ++ ++ if (saved_reg_count > 0) { ++ FAIL_IF(push_inst32(compiler, 0xf84d0d00 | (saved_reg_count >= 3 ? 16 : 8) ++ | (saved_reg_list[0] << 12) /* str rX, [sp, #-8/-16]! */)); ++ if (saved_reg_count >= 2) { ++ SLJIT_ASSERT(saved_reg_list[1] < 8); ++ FAIL_IF(push_inst16(compiler, 0x9001 | (saved_reg_list[1] << 8) /* str rX, [sp, #4] */)); ++ } ++ if (saved_reg_count >= 3) { ++ SLJIT_ASSERT(saved_reg_list[2] < 8); ++ FAIL_IF(push_inst16(compiler, 0x9002 | (saved_reg_list[2] << 8) /* str rX, [sp, #8] */)); ++ } ++ } ++ + #if defined(__GNUC__) + FAIL_IF(sljit_emit_ijump(compiler, SLJIT_FAST_CALL, SLJIT_IMM, +- (op == SLJIT_LUDIV ? SLJIT_FUNC_OFFSET(__aeabi_uidivmod) : SLJIT_FUNC_OFFSET(__aeabi_idivmod)))); ++ ((op | 0x2) == SLJIT_UDIVI ? 
SLJIT_FUNC_OFFSET(__aeabi_uidivmod) : SLJIT_FUNC_OFFSET(__aeabi_idivmod)))); + #else + #error "Software divmod functions are needed" + #endif +- if (compiler->scratches >= 4) { +- FAIL_IF(push_inst32(compiler, 0xf85dcb04 /* ldr ip, [sp], #4 */)); +- return push_inst32(compiler, 0xf85d2b04 /* ldr r2, [sp], #4 */); +- } else if (compiler->scratches >= 3) +- return push_inst32(compiler, 0xf85d2b08 /* ldr r2, [sp], #8 */); ++ ++ if (saved_reg_count > 0) { ++ if (saved_reg_count >= 3) { ++ SLJIT_ASSERT(saved_reg_list[2] < 8); ++ FAIL_IF(push_inst16(compiler, 0x9802 | (saved_reg_list[2] << 8) /* ldr rX, [sp, #8] */)); ++ } ++ if (saved_reg_count >= 2) { ++ SLJIT_ASSERT(saved_reg_list[1] < 8); ++ FAIL_IF(push_inst16(compiler, 0x9801 | (saved_reg_list[1] << 8) /* ldr rX, [sp, #4] */)); ++ } ++ return push_inst32(compiler, 0xf85d0b00 | (saved_reg_count >= 3 ? 16 : 8) ++ | (saved_reg_list[0] << 12) /* ldr rX, [sp], #8/16 */); ++ } + return SLJIT_SUCCESS; + } + +diff --git a/ext/pcre/pcrelib/sljit/sljitNativeMIPS_common.c b/ext/pcre/pcrelib/sljit/sljitNativeMIPS_common.c +index 3e2c9f0..cf3535f 100644 +--- a/ext/pcre/pcrelib/sljit/sljitNativeMIPS_common.c ++++ b/ext/pcre/pcrelib/sljit/sljitNativeMIPS_common.c +@@ -1053,8 +1053,11 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler + #endif + FAIL_IF(push_inst(compiler, MFLO | D(SLJIT_R0), DR(SLJIT_R0))); + return push_inst(compiler, MFHI | D(SLJIT_R1), DR(SLJIT_R1)); +- case SLJIT_LUDIV: +- case SLJIT_LSDIV: ++ case SLJIT_UDIVMOD: ++ case SLJIT_SDIVMOD: ++ case SLJIT_UDIVI: ++ case SLJIT_SDIVI: ++ SLJIT_COMPILE_ASSERT((SLJIT_UDIVMOD & 0x2) == 0 && SLJIT_UDIVI - 0x2 == SLJIT_UDIVMOD, bad_div_opcode_assignments); + #if !(defined SLJIT_MIPS_R1 && SLJIT_MIPS_R1) + FAIL_IF(push_inst(compiler, NOP, UNMOVABLE_INS)); + FAIL_IF(push_inst(compiler, NOP, UNMOVABLE_INS)); +@@ -1062,15 +1065,15 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler + + #if (defined 
SLJIT_CONFIG_MIPS_64 && SLJIT_CONFIG_MIPS_64) + if (int_op) +- FAIL_IF(push_inst(compiler, (op == SLJIT_LUDIV ? DIVU : DIV) | S(SLJIT_R0) | T(SLJIT_R1), MOVABLE_INS)); ++ FAIL_IF(push_inst(compiler, ((op | 0x2) == SLJIT_UDIVI ? DIVU : DIV) | S(SLJIT_R0) | T(SLJIT_R1), MOVABLE_INS)); + else +- FAIL_IF(push_inst(compiler, (op == SLJIT_LUDIV ? DDIVU : DDIV) | S(SLJIT_R0) | T(SLJIT_R1), MOVABLE_INS)); ++ FAIL_IF(push_inst(compiler, ((op | 0x2) == SLJIT_UDIVI ? DDIVU : DDIV) | S(SLJIT_R0) | T(SLJIT_R1), MOVABLE_INS)); + #else +- FAIL_IF(push_inst(compiler, (op == SLJIT_LUDIV ? DIVU : DIV) | S(SLJIT_R0) | T(SLJIT_R1), MOVABLE_INS)); ++ FAIL_IF(push_inst(compiler, ((op | 0x2) == SLJIT_UDIVI ? DIVU : DIV) | S(SLJIT_R0) | T(SLJIT_R1), MOVABLE_INS)); + #endif + + FAIL_IF(push_inst(compiler, MFLO | D(SLJIT_R0), DR(SLJIT_R0))); +- return push_inst(compiler, MFHI | D(SLJIT_R1), DR(SLJIT_R1)); ++ return (op >= SLJIT_UDIVI) ? SLJIT_SUCCESS : push_inst(compiler, MFHI | D(SLJIT_R1), DR(SLJIT_R1)); + } + + return SLJIT_SUCCESS; +diff --git a/ext/pcre/pcrelib/sljit/sljitNativePPC_common.c b/ext/pcre/pcrelib/sljit/sljitNativePPC_common.c +index 08d5356..b6a043f 100644 +--- a/ext/pcre/pcrelib/sljit/sljitNativePPC_common.c ++++ b/ext/pcre/pcrelib/sljit/sljitNativePPC_common.c +@@ -1267,22 +1267,23 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler + FAIL_IF(push_inst(compiler, MULLW | D(SLJIT_R0) | A(TMP_REG1) | B(SLJIT_R1))); + return push_inst(compiler, (op == SLJIT_LUMUL ? MULHWU : MULHW) | D(SLJIT_R1) | A(TMP_REG1) | B(SLJIT_R1)); + #endif +- case SLJIT_LUDIV: +- case SLJIT_LSDIV: ++ case SLJIT_UDIVMOD: ++ case SLJIT_SDIVMOD: + FAIL_IF(push_inst(compiler, OR | S(SLJIT_R0) | A(TMP_REG1) | B(SLJIT_R0))); + #if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64) +- if (int_op) { +- FAIL_IF(push_inst(compiler, (op == SLJIT_LUDIV ? 
DIVWU : DIVW) | D(SLJIT_R0) | A(TMP_REG1) | B(SLJIT_R1))); +- FAIL_IF(push_inst(compiler, MULLW | D(SLJIT_R1) | A(SLJIT_R0) | B(SLJIT_R1))); +- } else { +- FAIL_IF(push_inst(compiler, (op == SLJIT_LUDIV ? DIVDU : DIVD) | D(SLJIT_R0) | A(TMP_REG1) | B(SLJIT_R1))); +- FAIL_IF(push_inst(compiler, MULLD | D(SLJIT_R1) | A(SLJIT_R0) | B(SLJIT_R1))); +- } +- return push_inst(compiler, SUBF | D(SLJIT_R1) | A(SLJIT_R1) | B(TMP_REG1)); ++ FAIL_IF(push_inst(compiler, (int_op ? (op == SLJIT_UDIVMOD ? DIVWU : DIVW) : (op == SLJIT_UDIVMOD ? DIVDU : DIVD)) | D(SLJIT_R0) | A(SLJIT_R0) | B(SLJIT_R1))); ++ FAIL_IF(push_inst(compiler, (int_op ? MULLW : MULLD) | D(SLJIT_R1) | A(SLJIT_R0) | B(SLJIT_R1))); + #else +- FAIL_IF(push_inst(compiler, (op == SLJIT_LUDIV ? DIVWU : DIVW) | D(SLJIT_R0) | A(TMP_REG1) | B(SLJIT_R1))); ++ FAIL_IF(push_inst(compiler, (op == SLJIT_UDIVMOD ? DIVWU : DIVW) | D(SLJIT_R0) | A(SLJIT_R0) | B(SLJIT_R1))); + FAIL_IF(push_inst(compiler, MULLW | D(SLJIT_R1) | A(SLJIT_R0) | B(SLJIT_R1))); ++#endif + return push_inst(compiler, SUBF | D(SLJIT_R1) | A(SLJIT_R1) | B(TMP_REG1)); ++ case SLJIT_UDIVI: ++ case SLJIT_SDIVI: ++#if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64) ++ return push_inst(compiler, (int_op ? (op == SLJIT_UDIVI ? DIVWU : DIVW) : (op == SLJIT_UDIVI ? DIVDU : DIVD)) | D(SLJIT_R0) | A(SLJIT_R0) | B(SLJIT_R1)); ++#else ++ return push_inst(compiler, (op == SLJIT_UDIVI ? 
DIVWU : DIVW) | D(SLJIT_R0) | A(SLJIT_R0) | B(SLJIT_R1)); + #endif + } + +diff --git a/ext/pcre/pcrelib/sljit/sljitNativeSPARC_common.c b/ext/pcre/pcrelib/sljit/sljitNativeSPARC_common.c +index 0b1927a..327c426 100644 +--- a/ext/pcre/pcrelib/sljit/sljitNativeSPARC_common.c ++++ b/ext/pcre/pcrelib/sljit/sljitNativeSPARC_common.c +@@ -777,20 +777,25 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler + #else + #error "Implementation required" + #endif +- case SLJIT_LUDIV: +- case SLJIT_LSDIV: ++ case SLJIT_UDIVMOD: ++ case SLJIT_SDIVMOD: ++ case SLJIT_UDIVI: ++ case SLJIT_SDIVI: ++ SLJIT_COMPILE_ASSERT((SLJIT_UDIVMOD & 0x2) == 0 && SLJIT_UDIVI - 0x2 == SLJIT_UDIVMOD, bad_div_opcode_assignments); + #if (defined SLJIT_CONFIG_SPARC_32 && SLJIT_CONFIG_SPARC_32) +- if (op == SLJIT_LUDIV) ++ if ((op | 0x2) == SLJIT_UDIVI) + FAIL_IF(push_inst(compiler, WRY | S1(0), MOVABLE_INS)); + else { + FAIL_IF(push_inst(compiler, SRA | D(TMP_REG1) | S1(SLJIT_R0) | IMM(31), DR(TMP_REG1))); + FAIL_IF(push_inst(compiler, WRY | S1(TMP_REG1), MOVABLE_INS)); + } +- FAIL_IF(push_inst(compiler, OR | D(TMP_REG2) | S1(0) | S2(SLJIT_R0), DR(TMP_REG2))); +- FAIL_IF(push_inst(compiler, (op == SLJIT_LUDIV ? UDIV : SDIV) | D(SLJIT_R0) | S1(SLJIT_R0) | S2(SLJIT_R1), DR(SLJIT_R0))); ++ if (op <= SLJIT_SDIVMOD) ++ FAIL_IF(push_inst(compiler, OR | D(TMP_REG2) | S1(0) | S2(SLJIT_R0), DR(TMP_REG2))); ++ FAIL_IF(push_inst(compiler, ((op | 0x2) == SLJIT_UDIVI ? 
UDIV : SDIV) | D(SLJIT_R0) | S1(SLJIT_R0) | S2(SLJIT_R1), DR(SLJIT_R0))); ++ if (op >= SLJIT_UDIVI) ++ return SLJIT_SUCCESS; + FAIL_IF(push_inst(compiler, SMUL | D(SLJIT_R1) | S1(SLJIT_R0) | S2(SLJIT_R1), DR(SLJIT_R1))); +- FAIL_IF(push_inst(compiler, SUB | D(SLJIT_R1) | S1(TMP_REG2) | S2(SLJIT_R1), DR(SLJIT_R1))); +- return SLJIT_SUCCESS; ++ return push_inst(compiler, SUB | D(SLJIT_R1) | S1(TMP_REG2) | S2(SLJIT_R1), DR(SLJIT_R1)); + #else + #error "Implementation required" + #endif +diff --git a/ext/pcre/pcrelib/sljit/sljitNativeTILEGX_64.c b/ext/pcre/pcrelib/sljit/sljitNativeTILEGX_64.c +index 1d6aa5a..4d40392f 100644 +--- a/ext/pcre/pcrelib/sljit/sljitNativeTILEGX_64.c ++++ b/ext/pcre/pcrelib/sljit/sljitNativeTILEGX_64.c +@@ -35,21 +35,21 @@ + #define SIMM_16BIT_MIN (-0x8000) + #define SIMM_17BIT_MAX (0xffff) + #define SIMM_17BIT_MIN (-0x10000) +-#define SIMM_32BIT_MIN (-0x80000000) + #define SIMM_32BIT_MAX (0x7fffffff) +-#define SIMM_48BIT_MIN (0x800000000000L) ++#define SIMM_32BIT_MIN (-0x7fffffff - 1) + #define SIMM_48BIT_MAX (0x7fffffff0000L) ++#define SIMM_48BIT_MIN (-0x800000000000L) + #define IMM16(imm) ((imm) & 0xffff) + + #define UIMM_16BIT_MAX (0xffff) + +-#define TMP_REG1 (SLJIT_NO_REGISTERS + 1) +-#define TMP_REG2 (SLJIT_NO_REGISTERS + 2) +-#define TMP_REG3 (SLJIT_NO_REGISTERS + 3) +-#define ADDR_TMP (SLJIT_NO_REGISTERS + 4) ++#define TMP_REG1 (SLJIT_NUMBER_OF_REGISTERS + 2) ++#define TMP_REG2 (SLJIT_NUMBER_OF_REGISTERS + 3) ++#define TMP_REG3 (SLJIT_NUMBER_OF_REGISTERS + 4) ++#define ADDR_TMP (SLJIT_NUMBER_OF_REGISTERS + 5) + #define PIC_ADDR_REG TMP_REG2 + +-static SLJIT_CONST sljit_ub reg_map[SLJIT_NO_REGISTERS + 5] = { ++static SLJIT_CONST sljit_ub reg_map[SLJIT_NUMBER_OF_REGISTERS + 6] = { + 63, 0, 1, 2, 3, 4, 30, 31, 32, 33, 34, 54, 5, 16, 6, 7 + }; + +@@ -58,11 +58,6 @@ static SLJIT_CONST sljit_ub reg_map[SLJIT_NO_REGISTERS + 5] = { + #define TMP_REG2_mapped 16 + #define TMP_REG3_mapped 6 + #define ADDR_TMP_mapped 7 +-#define 
SLJIT_SAVED_REG1_mapped 30 +-#define SLJIT_SAVED_REG2_mapped 31 +-#define SLJIT_SAVED_REG3_mapped 32 +-#define SLJIT_SAVED_EREG1_mapped 33 +-#define SLJIT_SAVED_EREG2_mapped 34 + + /* Flags are keept in volatile registers. */ + #define EQUAL_FLAG 8 +@@ -399,6 +394,9 @@ static sljit_si push_inst(struct sljit_compiler *compiler, sljit_ins ins) + #define SUB(dst, srca, srcb) \ + push_3_buffer(compiler, TILEGX_OPC_SUB, dst, srca, srcb, __LINE__) + ++#define MUL(dst, srca, srcb) \ ++ push_3_buffer(compiler, TILEGX_OPC_MULX, dst, srca, srcb, __LINE__) ++ + #define NOR(dst, srca, srcb) \ + push_3_buffer(compiler, TILEGX_OPC_NOR, dst, srca, srcb, __LINE__) + +@@ -547,8 +545,8 @@ const struct Format* compute_format() + + const struct Format* match = NULL; + const struct Format *b = NULL; +- unsigned int i = 0; +- for (i; i < sizeof formats / sizeof formats[0]; i++) { ++ unsigned int i; ++ for (i = 0; i < sizeof formats / sizeof formats[0]; i++) { + b = &formats[i]; + if ((b->pipe_mask & compatible_pipes) == b->pipe_mask) { + match = b; +@@ -625,7 +623,6 @@ tilegx_bundle_bits get_bundle_bit(struct jit_instr *inst) + + static sljit_si update_buffer(struct sljit_compiler *compiler) + { +- int count; + int i; + int orig_index = inst_buf_index; + struct jit_instr inst0 = inst_buf[0]; +@@ -738,8 +735,10 @@ static sljit_si update_buffer(struct sljit_compiler *compiler) + + static sljit_si flush_buffer(struct sljit_compiler *compiler) + { +- while (inst_buf_index != 0) +- update_buffer(compiler); ++ while (inst_buf_index != 0) { ++ FAIL_IF(update_buffer(compiler)); ++ } ++ return SLJIT_SUCCESS; + } + + static sljit_si push_4_buffer(struct sljit_compiler *compiler, tilegx_mnemonic opc, int op0, int op1, int op2, int op3, int line) +@@ -787,6 +786,7 @@ static sljit_si push_3_buffer(struct sljit_compiler *compiler, tilegx_mnemonic o + case TILEGX_OPC_ADD: + case TILEGX_OPC_AND: + case TILEGX_OPC_SUB: ++ case TILEGX_OPC_MULX: + case TILEGX_OPC_OR: + case TILEGX_OPC_XOR: + case 
TILEGX_OPC_NOR: +@@ -905,7 +905,6 @@ static SLJIT_INLINE sljit_ins * detect_jump_type(struct sljit_jump *jump, sljit_ + sljit_sw diff; + sljit_uw target_addr; + sljit_ins *inst; +- sljit_ins saved_inst; + + if (jump->flags & SLJIT_REWRITABLE_JUMP) + return code_ptr; +@@ -1009,7 +1008,7 @@ SLJIT_API_FUNC_ATTRIBUTE void * sljit_generate_code(struct sljit_compiler *compi + struct sljit_const *const_; + + CHECK_ERROR_PTR(); +- check_sljit_generate_code(compiler); ++ CHECK_PTR(check_sljit_generate_code(compiler)); + reverse_buf(compiler); + + code = (sljit_ins *)SLJIT_MALLOC_EXEC(compiler->size * sizeof(sljit_ins)); +@@ -1178,13 +1177,13 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_enter(struct sljit_compiler *compil + sljit_si fscratches, sljit_si fsaveds, sljit_si local_size) + { + sljit_ins base; +- sljit_ins bundle = 0; +- ++ sljit_si i, tmp; ++ + CHECK_ERROR(); +- check_sljit_emit_enter(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size); ++ CHECK(check_sljit_emit_enter(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size)); + set_emit_enter(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size); + +- local_size += (saveds + 1) * sizeof(sljit_sw); ++ local_size += GET_SAVED_REGISTERS_SIZE(scratches, saveds, 1); + local_size = (local_size + 7) & ~7; + compiler->local_size = local_size; + +@@ -1200,56 +1199,52 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_enter(struct sljit_compiler *compil + local_size = 0; + } + ++ /* Save the return address. 
*/ + FAIL_IF(ADDLI(ADDR_TMP_mapped, base, local_size - 8)); + FAIL_IF(ST_ADD(ADDR_TMP_mapped, RA, -8)); + +- if (saveds >= 1) +- FAIL_IF(ST_ADD(ADDR_TMP_mapped, SLJIT_SAVED_REG1_mapped, -8)); +- +- if (saveds >= 2) +- FAIL_IF(ST_ADD(ADDR_TMP_mapped, SLJIT_SAVED_REG2_mapped, -8)); +- +- if (saveds >= 3) +- FAIL_IF(ST_ADD(ADDR_TMP_mapped, SLJIT_SAVED_REG3_mapped, -8)); +- +- if (saveds >= 4) +- FAIL_IF(ST_ADD(ADDR_TMP_mapped, SLJIT_SAVED_EREG1_mapped, -8)); +- +- if (saveds >= 5) +- FAIL_IF(ST_ADD(ADDR_TMP_mapped, SLJIT_SAVED_EREG2_mapped, -8)); +- +- if (args >= 1) +- FAIL_IF(ADD(SLJIT_SAVED_REG1_mapped, 0, ZERO)); ++ /* Save the S registers. */ ++ tmp = saveds < SLJIT_NUMBER_OF_SAVED_REGISTERS ? (SLJIT_S0 + 1 - saveds) : SLJIT_FIRST_SAVED_REG; ++ for (i = SLJIT_S0; i >= tmp; i--) { ++ FAIL_IF(ST_ADD(ADDR_TMP_mapped, reg_map[i], -8)); ++ } + +- if (args >= 2) +- FAIL_IF(ADD(SLJIT_SAVED_REG2_mapped, 1, ZERO)); ++ /* Save the R registers that need to be reserved. */ ++ for (i = scratches; i >= SLJIT_FIRST_SAVED_REG; i--) { ++ FAIL_IF(ST_ADD(ADDR_TMP_mapped, reg_map[i], -8)); ++ } + +- if (args >= 3) +- FAIL_IF(ADD(SLJIT_SAVED_REG3_mapped, 2, ZERO)); ++ /* Move the arguments to S registers. 
*/ ++ for (i = 0; i < args; i++) { ++ FAIL_IF(ADD(reg_map[SLJIT_S0 - i], i, ZERO)); ++ } + + return SLJIT_SUCCESS; + } + +-SLJIT_API_FUNC_ATTRIBUTE void sljit_set_context(struct sljit_compiler *compiler, ++SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_set_context(struct sljit_compiler *compiler, + sljit_si options, sljit_si args, sljit_si scratches, sljit_si saveds, + sljit_si fscratches, sljit_si fsaveds, sljit_si local_size) + { +- CHECK_ERROR_VOID(); +- check_sljit_set_context(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size); ++ CHECK_ERROR(); ++ CHECK(check_sljit_set_context(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size)); + set_set_context(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size); + +- local_size += (saveds + 1) * sizeof(sljit_sw); ++ local_size += GET_SAVED_REGISTERS_SIZE(scratches, saveds, 1); + compiler->local_size = (local_size + 7) & ~7; ++ ++ return SLJIT_SUCCESS; + } + + SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_return(struct sljit_compiler *compiler, sljit_si op, sljit_si src, sljit_sw srcw) + { + sljit_si local_size; + sljit_ins base; +- int addr_initialized = 0; ++ sljit_si i, tmp; ++ sljit_si saveds; + + CHECK_ERROR(); +- check_sljit_emit_return(compiler, op, src, srcw); ++ CHECK(check_sljit_emit_return(compiler, op, src, srcw)); + + FAIL_IF(emit_mov_before_return(compiler, op, src, srcw)); + +@@ -1263,50 +1258,20 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_return(struct sljit_compiler *compi + local_size = 0; + } + ++ /* Restore the return address. */ + FAIL_IF(ADDLI(ADDR_TMP_mapped, base, local_size - 8)); +- FAIL_IF(LD(RA, ADDR_TMP_mapped)); +- +- if (compiler->saveds >= 5) { +- FAIL_IF(ADDLI(ADDR_TMP_mapped, base, local_size - 48)); +- addr_initialized = 1; ++ FAIL_IF(LD_ADD(RA, ADDR_TMP_mapped, -8)); + +- FAIL_IF(LD_ADD(SLJIT_SAVED_EREG2_mapped, ADDR_TMP_mapped, 8)); ++ /* Restore the S registers. 
*/ ++ saveds = compiler->saveds; ++ tmp = saveds < SLJIT_NUMBER_OF_SAVED_REGISTERS ? (SLJIT_S0 + 1 - saveds) : SLJIT_FIRST_SAVED_REG; ++ for (i = SLJIT_S0; i >= tmp; i--) { ++ FAIL_IF(LD_ADD(reg_map[i], ADDR_TMP_mapped, -8)); + } + +- if (compiler->saveds >= 4) { +- if (addr_initialized == 0) { +- FAIL_IF(ADDLI(ADDR_TMP_mapped, base, local_size - 40)); +- addr_initialized = 1; +- } +- +- FAIL_IF(LD_ADD(SLJIT_SAVED_EREG1_mapped, ADDR_TMP_mapped, 8)); +- } +- +- if (compiler->saveds >= 3) { +- if (addr_initialized == 0) { +- FAIL_IF(ADDLI(ADDR_TMP_mapped, base, local_size - 32)); +- addr_initialized = 1; +- } +- +- FAIL_IF(LD_ADD(SLJIT_SAVED_REG3_mapped, ADDR_TMP_mapped, 8)); +- } +- +- if (compiler->saveds >= 2) { +- if (addr_initialized == 0) { +- FAIL_IF(ADDLI(ADDR_TMP_mapped, base, local_size - 24)); +- addr_initialized = 1; +- } +- +- FAIL_IF(LD_ADD(SLJIT_SAVED_REG2_mapped, ADDR_TMP_mapped, 8)); +- } +- +- if (compiler->saveds >= 1) { +- if (addr_initialized == 0) { +- FAIL_IF(ADDLI(ADDR_TMP_mapped, base, local_size - 16)); +- /* addr_initialized = 1; no need to initialize as it's the last one. */ +- } +- +- FAIL_IF(LD_ADD(SLJIT_SAVED_REG1_mapped, ADDR_TMP_mapped, 8)); ++ /* Restore the R registers that need to be reserved. */ ++ for (i = compiler->scratches; i >= SLJIT_FIRST_SAVED_REG; i--) { ++ FAIL_IF(LD_ADD(reg_map[i], ADDR_TMP_mapped, -8)); + } + + if (compiler->local_size <= SIMM_16BIT_MAX) +@@ -1585,7 +1550,7 @@ static SLJIT_INLINE sljit_si emit_op_mem2(struct sljit_compiler *compiler, sljit + SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_fast_enter(struct sljit_compiler *compiler, sljit_si dst, sljit_sw dstw) + { + CHECK_ERROR(); +- check_sljit_emit_fast_enter(compiler, dst, dstw); ++ CHECK(check_sljit_emit_fast_enter(compiler, dst, dstw)); + ADJUST_LOCAL_OFFSET(dst, dstw); + + /* For UNUSED dst. Uncommon, but possible. 
*/ +@@ -1602,7 +1567,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_fast_enter(struct sljit_compiler *c + SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_fast_return(struct sljit_compiler *compiler, sljit_si src, sljit_sw srcw) + { + CHECK_ERROR(); +- check_sljit_emit_fast_return(compiler, src, srcw); ++ CHECK(check_sljit_emit_fast_return(compiler, src, srcw)); + ADJUST_LOCAL_OFFSET(src, srcw); + + if (FAST_IS_REG(src)) +@@ -1636,9 +1601,11 @@ static SLJIT_INLINE sljit_si emit_single_op(struct sljit_compiler *compiler, slj + if (op == SLJIT_MOV_SI) + return BFEXTS(reg_map[dst], reg_map[src2], 0, 31); + +- return BFEXTU(reg_map[dst], reg_map[src2], 0, 31); +- } else if (dst != src2) +- SLJIT_ASSERT_STOP(); ++ return BFEXTU(reg_map[dst], reg_map[src2], 0, 31); ++ } else if (dst != src2) { ++ SLJIT_ASSERT(src2 == 0); ++ return ADD(reg_map[dst], reg_map[src2], ZERO); ++ } + + return SLJIT_SUCCESS; + +@@ -1650,8 +1617,10 @@ static SLJIT_INLINE sljit_si emit_single_op(struct sljit_compiler *compiler, slj + return BFEXTS(reg_map[dst], reg_map[src2], 0, 7); + + return BFEXTU(reg_map[dst], reg_map[src2], 0, 7); +- } else if (dst != src2) +- SLJIT_ASSERT_STOP(); ++ } else if (dst != src2) { ++ SLJIT_ASSERT(src2 == 0); ++ return ADD(reg_map[dst], reg_map[src2], ZERO); ++ } + + return SLJIT_SUCCESS; + +@@ -1663,8 +1632,10 @@ static SLJIT_INLINE sljit_si emit_single_op(struct sljit_compiler *compiler, slj + return BFEXTS(reg_map[dst], reg_map[src2], 0, 15); + + return BFEXTU(reg_map[dst], reg_map[src2], 0, 15); +- } else if (dst != src2) +- SLJIT_ASSERT_STOP(); ++ } else if (dst != src2) { ++ SLJIT_ASSERT(src2 == 0); ++ return ADD(reg_map[dst], reg_map[src2], ZERO); ++ } + + return SLJIT_SUCCESS; + +@@ -1811,7 +1782,6 @@ static SLJIT_INLINE sljit_si emit_single_op(struct sljit_compiler *compiler, slj + else { + /* Rare ocasion. 
*/ + FAIL_IF(ADD(TMP_EREG2, reg_map[src1], ZERO)); +- + overflow_ra = TMP_EREG2; + } + } +@@ -1903,6 +1873,17 @@ static SLJIT_INLINE sljit_si emit_single_op(struct sljit_compiler *compiler, slj + + return SLJIT_SUCCESS; + ++ case SLJIT_MUL: ++ if (flags & SRC2_IMM) { ++ FAIL_IF(load_immediate(compiler, TMP_REG2_mapped, src2)); ++ src2 = TMP_REG2; ++ flags &= ~SRC2_IMM; ++ } ++ ++ FAIL_IF(MUL(reg_map[dst], reg_map[src1], reg_map[src2])); ++ ++ return SLJIT_SUCCESS; ++ + #define EMIT_LOGICAL(op_imm, op_norm) \ + if (flags & SRC2_IMM) { \ + FAIL_IF(load_immediate(compiler, ADDR_TMP_mapped, src2)); \ +@@ -1950,8 +1931,8 @@ static SLJIT_INLINE sljit_si emit_single_op(struct sljit_compiler *compiler, slj + } else { \ + if (op & SLJIT_SET_E) \ + FAIL_IF(push_3_buffer( \ +- compiler, op_imm, reg_map[dst], reg_map[src1], \ +- src2 & 0x3F, __LINE__)); \ ++ compiler, op_norm, EQUAL_FLAG, reg_map[src1], \ ++ reg_map[src2], __LINE__)); \ + if (CHECK_FLAGS(SLJIT_SET_E)) \ + FAIL_IF(push_3_buffer( \ + compiler, op_norm, reg_map[dst], reg_map[src1], \ +@@ -2105,66 +2086,61 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op_flags(struct sljit_compiler *com + { + sljit_si sugg_dst_ar, dst_ar; + sljit_si flags = GET_ALL_FLAGS(op); ++ sljit_si mem_type = (op & SLJIT_INT_OP) ? (INT_DATA | SIGNED_DATA) : WORD_DATA; + + CHECK_ERROR(); +- check_sljit_emit_op_flags(compiler, op, dst, dstw, src, srcw, type); ++ CHECK(check_sljit_emit_op_flags(compiler, op, dst, dstw, src, srcw, type)); + ADJUST_LOCAL_OFFSET(dst, dstw); + + if (dst == SLJIT_UNUSED) + return SLJIT_SUCCESS; + + op = GET_OPCODE(op); ++ if (op == SLJIT_MOV_SI || op == SLJIT_MOV_UI) ++ mem_type = INT_DATA | SIGNED_DATA; + sugg_dst_ar = reg_map[(op < SLJIT_ADD && FAST_IS_REG(dst)) ? 
dst : TMP_REG2]; + + compiler->cache_arg = 0; + compiler->cache_argw = 0; + if (op >= SLJIT_ADD && (src & SLJIT_MEM)) { + ADJUST_LOCAL_OFFSET(src, srcw); +- FAIL_IF(emit_op_mem2(compiler, WORD_DATA | LOAD_DATA, TMP_REG1_mapped, src, srcw, dst, dstw)); ++ FAIL_IF(emit_op_mem2(compiler, mem_type | LOAD_DATA, TMP_REG1_mapped, src, srcw, dst, dstw)); + src = TMP_REG1; + srcw = 0; + } + +- switch (type) { +- case SLJIT_C_EQUAL: +- case SLJIT_C_NOT_EQUAL: ++ switch (type & 0xff) { ++ case SLJIT_EQUAL: ++ case SLJIT_NOT_EQUAL: + FAIL_IF(CMPLTUI(sugg_dst_ar, EQUAL_FLAG, 1)); + dst_ar = sugg_dst_ar; + break; +- case SLJIT_C_LESS: +- case SLJIT_C_GREATER_EQUAL: +- case SLJIT_C_FLOAT_LESS: +- case SLJIT_C_FLOAT_GREATER_EQUAL: ++ case SLJIT_LESS: ++ case SLJIT_GREATER_EQUAL: + dst_ar = ULESS_FLAG; + break; +- case SLJIT_C_GREATER: +- case SLJIT_C_LESS_EQUAL: +- case SLJIT_C_FLOAT_GREATER: +- case SLJIT_C_FLOAT_LESS_EQUAL: ++ case SLJIT_GREATER: ++ case SLJIT_LESS_EQUAL: + dst_ar = UGREATER_FLAG; + break; +- case SLJIT_C_SIG_LESS: +- case SLJIT_C_SIG_GREATER_EQUAL: ++ case SLJIT_SIG_LESS: ++ case SLJIT_SIG_GREATER_EQUAL: + dst_ar = LESS_FLAG; + break; +- case SLJIT_C_SIG_GREATER: +- case SLJIT_C_SIG_LESS_EQUAL: ++ case SLJIT_SIG_GREATER: ++ case SLJIT_SIG_LESS_EQUAL: + dst_ar = GREATER_FLAG; + break; +- case SLJIT_C_OVERFLOW: +- case SLJIT_C_NOT_OVERFLOW: ++ case SLJIT_OVERFLOW: ++ case SLJIT_NOT_OVERFLOW: + dst_ar = OVERFLOW_FLAG; + break; +- case SLJIT_C_MUL_OVERFLOW: +- case SLJIT_C_MUL_NOT_OVERFLOW: ++ case SLJIT_MUL_OVERFLOW: ++ case SLJIT_MUL_NOT_OVERFLOW: + FAIL_IF(CMPLTUI(sugg_dst_ar, OVERFLOW_FLAG, 1)); + dst_ar = sugg_dst_ar; + type ^= 0x1; /* Flip type bit for the XORI below. 
*/ + break; +- case SLJIT_C_FLOAT_EQUAL: +- case SLJIT_C_FLOAT_NOT_EQUAL: +- dst_ar = EQUAL_FLAG; +- break; + + default: + SLJIT_ASSERT_STOP(); +@@ -2180,11 +2156,11 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op_flags(struct sljit_compiler *com + if (op >= SLJIT_ADD) { + if (TMP_REG2_mapped != dst_ar) + FAIL_IF(ADD(TMP_REG2_mapped, dst_ar, ZERO)); +- return emit_op(compiler, op | flags, CUMULATIVE_OP | LOGICAL_OP | IMM_OP | ALT_KEEP_CACHE, dst, dstw, src, srcw, TMP_REG2, 0); ++ return emit_op(compiler, op | flags, mem_type | CUMULATIVE_OP | LOGICAL_OP | IMM_OP | ALT_KEEP_CACHE, dst, dstw, src, srcw, TMP_REG2, 0); + } + + if (dst & SLJIT_MEM) +- return emit_op_mem(compiler, WORD_DATA, dst_ar, dst, dstw); ++ return emit_op_mem(compiler, mem_type, dst_ar, dst, dstw); + + if (sugg_dst_ar != dst_ar) + return ADD(sugg_dst_ar, dst_ar, ZERO); +@@ -2194,7 +2170,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op_flags(struct sljit_compiler *com + + SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler, sljit_si op) { + CHECK_ERROR(); +- check_sljit_emit_op0(compiler, op); ++ CHECK(check_sljit_emit_op0(compiler, op)); + + op = GET_OPCODE(op); + switch (op) { +@@ -2204,10 +2180,10 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler + case SLJIT_BREAKPOINT: + return PI(BPT); + +- case SLJIT_UMUL: +- case SLJIT_SMUL: +- case SLJIT_UDIV: +- case SLJIT_SDIV: ++ case SLJIT_LUMUL: ++ case SLJIT_LSMUL: ++ case SLJIT_UDIVI: ++ case SLJIT_SDIVI: + SLJIT_ASSERT_STOP(); + } + +@@ -2217,7 +2193,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler + SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op1(struct sljit_compiler *compiler, sljit_si op, sljit_si dst, sljit_sw dstw, sljit_si src, sljit_sw srcw) + { + CHECK_ERROR(); +- check_sljit_emit_op1(compiler, op, dst, dstw, src, srcw); ++ CHECK(check_sljit_emit_op1(compiler, op, dst, dstw, src, srcw)); + ADJUST_LOCAL_OFFSET(dst, dstw); + 
ADJUST_LOCAL_OFFSET(src, srcw); + +@@ -2273,7 +2249,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op1(struct sljit_compiler *compiler + return emit_op(compiler, SLJIT_SUB | GET_ALL_FLAGS(op), IMM_OP, dst, dstw, SLJIT_IMM, 0, src, srcw); + + case SLJIT_CLZ: +- return emit_op(compiler, op, 0, dst, dstw, TMP_REG1, 0, src, srcw); ++ return emit_op(compiler, op, (op & SLJIT_INT_OP) ? INT_DATA : WORD_DATA, dst, dstw, TMP_REG1, 0, src, srcw); + } + + return SLJIT_SUCCESS; +@@ -2282,7 +2258,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op1(struct sljit_compiler *compiler + SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op2(struct sljit_compiler *compiler, sljit_si op, sljit_si dst, sljit_sw dstw, sljit_si src1, sljit_sw src1w, sljit_si src2, sljit_sw src2w) + { + CHECK_ERROR(); +- check_sljit_emit_op2(compiler, op, dst, dstw, src1, src1w, src2, src2w); ++ CHECK(check_sljit_emit_op2(compiler, op, dst, dstw, src1, src1w, src2, src2w)); + ADJUST_LOCAL_OFFSET(dst, dstw); + ADJUST_LOCAL_OFFSET(src1, src1w); + ADJUST_LOCAL_OFFSET(src2, src2w); +@@ -2325,7 +2301,7 @@ SLJIT_API_FUNC_ATTRIBUTE struct sljit_label * sljit_emit_label(struct sljit_comp + flush_buffer(compiler); + + CHECK_ERROR_PTR(); +- check_sljit_emit_label(compiler); ++ CHECK_PTR(check_sljit_emit_label(compiler)); + + if (compiler->last_label && compiler->last_label->size == compiler->size) + return compiler->last_label; +@@ -2344,7 +2320,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_ijump(struct sljit_compiler *compil + flush_buffer(compiler); + + CHECK_ERROR(); +- check_sljit_emit_ijump(compiler, type, src, srcw); ++ CHECK(check_sljit_emit_ijump(compiler, type, src, srcw)); + ADJUST_LOCAL_OFFSET(src, srcw); + + if (FAST_IS_REG(src)) { +@@ -2404,8 +2380,10 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_ijump(struct sljit_compiler *compil + + return SLJIT_SUCCESS; + +- } else if (src & SLJIT_MEM) ++ } else if (src & SLJIT_MEM) { + FAIL_IF(emit_op(compiler, SLJIT_MOV, WORD_DATA, TMP_REG2, 0, TMP_REG1, 0, 
src, srcw)); ++ flush_buffer(compiler); ++ } + + FAIL_IF(JR_SOLO(reg_map[src_r])); + +@@ -2432,7 +2410,7 @@ SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump * sljit_emit_jump(struct sljit_compil + flush_buffer(compiler); + + CHECK_ERROR_PTR(); +- check_sljit_emit_jump(compiler, type); ++ CHECK_PTR(check_sljit_emit_jump(compiler, type)); + + jump = (struct sljit_jump *)ensure_abuf(compiler, sizeof(struct sljit_jump)); + PTR_FAIL_IF(!jump); +@@ -2440,48 +2418,42 @@ SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump * sljit_emit_jump(struct sljit_compil + type &= 0xff; + + switch (type) { +- case SLJIT_C_EQUAL: +- case SLJIT_C_FLOAT_NOT_EQUAL: ++ case SLJIT_EQUAL: + BR_NZ(EQUAL_FLAG); + break; +- case SLJIT_C_NOT_EQUAL: +- case SLJIT_C_FLOAT_EQUAL: ++ case SLJIT_NOT_EQUAL: + BR_Z(EQUAL_FLAG); + break; +- case SLJIT_C_LESS: +- case SLJIT_C_FLOAT_LESS: ++ case SLJIT_LESS: + BR_Z(ULESS_FLAG); + break; +- case SLJIT_C_GREATER_EQUAL: +- case SLJIT_C_FLOAT_GREATER_EQUAL: ++ case SLJIT_GREATER_EQUAL: + BR_NZ(ULESS_FLAG); + break; +- case SLJIT_C_GREATER: +- case SLJIT_C_FLOAT_GREATER: ++ case SLJIT_GREATER: + BR_Z(UGREATER_FLAG); + break; +- case SLJIT_C_LESS_EQUAL: +- case SLJIT_C_FLOAT_LESS_EQUAL: ++ case SLJIT_LESS_EQUAL: + BR_NZ(UGREATER_FLAG); + break; +- case SLJIT_C_SIG_LESS: ++ case SLJIT_SIG_LESS: + BR_Z(LESS_FLAG); + break; +- case SLJIT_C_SIG_GREATER_EQUAL: ++ case SLJIT_SIG_GREATER_EQUAL: + BR_NZ(LESS_FLAG); + break; +- case SLJIT_C_SIG_GREATER: ++ case SLJIT_SIG_GREATER: + BR_Z(GREATER_FLAG); + break; +- case SLJIT_C_SIG_LESS_EQUAL: ++ case SLJIT_SIG_LESS_EQUAL: + BR_NZ(GREATER_FLAG); + break; +- case SLJIT_C_OVERFLOW: +- case SLJIT_C_MUL_OVERFLOW: ++ case SLJIT_OVERFLOW: ++ case SLJIT_MUL_OVERFLOW: + BR_Z(OVERFLOW_FLAG); + break; +- case SLJIT_C_NOT_OVERFLOW: +- case SLJIT_C_MUL_NOT_OVERFLOW: ++ case SLJIT_NOT_OVERFLOW: ++ case SLJIT_MUL_NOT_OVERFLOW: + BR_NZ(OVERFLOW_FLAG); + break; + default: +@@ -2536,7 +2508,7 @@ SLJIT_API_FUNC_ATTRIBUTE struct sljit_const * 
sljit_emit_const(struct sljit_comp + flush_buffer(compiler); + + CHECK_ERROR_PTR(); +- check_sljit_emit_const(compiler, dst, dstw, init_value); ++ CHECK_PTR(check_sljit_emit_const(compiler, dst, dstw, init_value)); + ADJUST_LOCAL_OFFSET(dst, dstw); + + const_ = (struct sljit_const *)ensure_abuf(compiler, sizeof(struct sljit_const)); +@@ -2572,3 +2544,18 @@ SLJIT_API_FUNC_ATTRIBUTE void sljit_set_const(sljit_uw addr, sljit_sw new_consta + inst[3] = (inst[3] & ~(0xFFFFL << 43)) | ((new_constant & 0xFFFFL) << 43); + SLJIT_CACHE_FLUSH(inst, inst + 4); + } ++ ++SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_get_register_index(sljit_si reg) ++{ ++ CHECK_REG_INDEX(check_sljit_get_register_index(reg)); ++ return reg_map[reg]; ++} ++ ++SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op_custom(struct sljit_compiler *compiler, ++ void *instruction, sljit_si size) ++{ ++ CHECK_ERROR(); ++ CHECK(check_sljit_emit_op_custom(compiler, instruction, size)); ++ return SLJIT_ERR_UNSUPPORTED; ++} ++ +diff --git a/ext/pcre/pcrelib/sljit/sljitNativeX86_common.c b/ext/pcre/pcrelib/sljit/sljitNativeX86_common.c +index 22a163f..416c15a 100644 +--- a/ext/pcre/pcrelib/sljit/sljitNativeX86_common.c ++++ b/ext/pcre/pcrelib/sljit/sljitNativeX86_common.c +@@ -273,7 +273,9 @@ static sljit_si cpu_has_sse2 = -1; + #endif + static sljit_si cpu_has_cmov = -1; + +-#if defined(_MSC_VER) && _MSC_VER >= 1400 ++#ifdef _WIN32_WCE ++#include ++#elif defined(_MSC_VER) && _MSC_VER >= 1400 + #include + #endif + +@@ -742,8 +744,10 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler + break; + case SLJIT_LUMUL: + case SLJIT_LSMUL: +- case SLJIT_LUDIV: +- case SLJIT_LSDIV: ++ case SLJIT_UDIVMOD: ++ case SLJIT_SDIVMOD: ++ case SLJIT_UDIVI: ++ case SLJIT_SDIVI: + compiler->flags_saved = 0; + #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) + #ifdef _WIN64 +@@ -761,9 +765,10 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler + #endif + compiler->mode32 = 
op & SLJIT_INT_OP; + #endif ++ SLJIT_COMPILE_ASSERT((SLJIT_UDIVMOD & 0x2) == 0 && SLJIT_UDIVI - 0x2 == SLJIT_UDIVMOD, bad_div_opcode_assignments); + + op = GET_OPCODE(op); +- if (op == SLJIT_LUDIV) { ++ if ((op | 0x2) == SLJIT_UDIVI) { + #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) || defined(_WIN64) + EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_R1, 0); + inst = emit_x86_instruction(compiler, 1, SLJIT_R1, 0, SLJIT_R1, 0); +@@ -774,7 +779,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler + *inst = XOR_r_rm; + } + +- if (op == SLJIT_LSDIV) { ++ if ((op | 0x2) == SLJIT_SDIVI) { + #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) || defined(_WIN64) + EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_R1, 0); + #endif +@@ -805,10 +810,10 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler + FAIL_IF(!inst); + INC_SIZE(2); + *inst++ = GROUP_F7; +- *inst = MOD_REG | ((op >= SLJIT_LUDIV) ? reg_map[TMP_REG1] : reg_map[SLJIT_R1]); ++ *inst = MOD_REG | ((op >= SLJIT_UDIVMOD) ? reg_map[TMP_REG1] : reg_map[SLJIT_R1]); + #else + #ifdef _WIN64 +- size = (!compiler->mode32 || op >= SLJIT_LUDIV) ? 3 : 2; ++ size = (!compiler->mode32 || op >= SLJIT_UDIVMOD) ? 3 : 2; + #else + size = (!compiler->mode32) ? 3 : 2; + #endif +@@ -817,11 +822,11 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler + INC_SIZE(size); + #ifdef _WIN64 + if (!compiler->mode32) +- *inst++ = REX_W | ((op >= SLJIT_LUDIV) ? REX_B : 0); +- else if (op >= SLJIT_LUDIV) ++ *inst++ = REX_W | ((op >= SLJIT_UDIVMOD) ? REX_B : 0); ++ else if (op >= SLJIT_UDIVMOD) + *inst++ = REX_B; + *inst++ = GROUP_F7; +- *inst = MOD_REG | ((op >= SLJIT_LUDIV) ? reg_lmap[TMP_REG1] : reg_lmap[SLJIT_R1]); ++ *inst = MOD_REG | ((op >= SLJIT_UDIVMOD) ? 
reg_lmap[TMP_REG1] : reg_lmap[SLJIT_R1]); + #else + if (!compiler->mode32) + *inst++ = REX_W; +@@ -836,15 +841,21 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler + case SLJIT_LSMUL: + *inst |= IMUL; + break; +- case SLJIT_LUDIV: ++ case SLJIT_UDIVMOD: ++ case SLJIT_UDIVI: + *inst |= DIV; + break; +- case SLJIT_LSDIV: ++ case SLJIT_SDIVMOD: ++ case SLJIT_SDIVI: + *inst |= IDIV; + break; + } + #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) && !defined(_WIN64) +- EMIT_MOV(compiler, SLJIT_R1, 0, TMP_REG1, 0); ++ if (op <= SLJIT_SDIVMOD) ++ EMIT_MOV(compiler, SLJIT_R1, 0, TMP_REG1, 0); ++#else ++ if (op >= SLJIT_UDIVI) ++ EMIT_MOV(compiler, SLJIT_R1, 0, TMP_REG1, 0); + #endif + break; + } +@@ -1905,60 +1916,62 @@ static sljit_si emit_test_binary(struct sljit_compiler *compiler, + return SLJIT_SUCCESS; + } + +- if (FAST_IS_REG(src1)) { ++ if (!(src1 & SLJIT_IMM)) { + if (src2 & SLJIT_IMM) { + #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) + if (IS_HALFWORD(src2w) || compiler->mode32) { +- inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, 0); ++ inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, src1w); + FAIL_IF(!inst); + *inst = GROUP_F7; + } + else { + FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src2w)); +- inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, src1, 0); ++ inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, src1, src1w); + FAIL_IF(!inst); + *inst = TEST_rm_r; + } + #else +- inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, 0); ++ inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, src1w); + FAIL_IF(!inst); + *inst = GROUP_F7; + #endif ++ return SLJIT_SUCCESS; + } +- else { ++ else if (FAST_IS_REG(src1)) { + inst = emit_x86_instruction(compiler, 1, src1, 0, src2, src2w); + FAIL_IF(!inst); + *inst = TEST_rm_r; ++ return SLJIT_SUCCESS; + } +- return SLJIT_SUCCESS; + } + +- if (FAST_IS_REG(src2)) { ++ if (!(src2 & SLJIT_IMM)) { + if (src1 & 
SLJIT_IMM) { + #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) + if (IS_HALFWORD(src1w) || compiler->mode32) { +- inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src1w, src2, 0); ++ inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src1w, src2, src2w); + FAIL_IF(!inst); + *inst = GROUP_F7; + } + else { + FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src1w)); +- inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, src2, 0); ++ inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, src2, src2w); + FAIL_IF(!inst); + *inst = TEST_rm_r; + } + #else +- inst = emit_x86_instruction(compiler, 1, src1, src1w, src2, 0); ++ inst = emit_x86_instruction(compiler, 1, src1, src1w, src2, src2w); + FAIL_IF(!inst); + *inst = GROUP_F7; + #endif ++ return SLJIT_SUCCESS; + } +- else { ++ else if (FAST_IS_REG(src2)) { + inst = emit_x86_instruction(compiler, 1, src2, 0, src1, src1w); + FAIL_IF(!inst); + *inst = TEST_rm_r; ++ return SLJIT_SUCCESS; + } +- return SLJIT_SUCCESS; + } + + EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w); +@@ -2923,3 +2936,69 @@ SLJIT_API_FUNC_ATTRIBUTE void sljit_set_const(sljit_uw addr, sljit_sw new_consta + { + *(sljit_sw*)addr = new_constant; + } ++ ++SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_x86_is_sse2_available(void) ++{ ++#if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2) ++ if (cpu_has_sse2 == -1) ++ get_cpu_features(); ++ return cpu_has_sse2; ++#else ++ return 1; ++#endif ++} ++ ++SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_x86_is_cmov_available(void) ++{ ++ if (cpu_has_cmov == -1) ++ get_cpu_features(); ++ return cpu_has_cmov; ++} ++ ++SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_x86_emit_cmov(struct sljit_compiler *compiler, ++ sljit_si type, ++ sljit_si dst_reg, ++ sljit_si src, sljit_sw srcw) ++{ ++ sljit_ub* inst; ++ ++ CHECK_ERROR(); ++#if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS) ++ CHECK_ARGUMENT(sljit_x86_is_cmov_available()); ++ CHECK_ARGUMENT(!(type & ~(0xff | SLJIT_INT_OP))); ++ CHECK_ARGUMENT((type & 0xff) >= SLJIT_EQUAL && 
(type & 0xff) <= SLJIT_D_ORDERED); ++ CHECK_ARGUMENT(FUNCTION_CHECK_IS_REG(dst_reg & ~SLJIT_INT_OP)); ++ FUNCTION_CHECK_SRC(src, srcw); ++#endif ++#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) ++ if (SLJIT_UNLIKELY(!!compiler->verbose)) { ++ fprintf(compiler->verbose, " x86_cmov%s %s%s, ", ++ !(dst_reg & SLJIT_INT_OP) ? "" : ".i", ++ JUMP_PREFIX(type), jump_names[type & 0xff]); ++ sljit_verbose_reg(compiler, dst_reg & ~SLJIT_INT_OP); ++ fprintf(compiler->verbose, ", "); ++ sljit_verbose_param(compiler, src, srcw); ++ fprintf(compiler->verbose, "\n"); ++ } ++#endif ++ ++ ADJUST_LOCAL_OFFSET(src, srcw); ++ CHECK_EXTRA_REGS(src, srcw, (void)0); ++ ++#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) ++ compiler->mode32 = dst_reg & SLJIT_INT_OP; ++#endif ++ dst_reg &= ~SLJIT_INT_OP; ++ ++ if (SLJIT_UNLIKELY(src & SLJIT_IMM)) { ++ EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, srcw); ++ src = TMP_REG1; ++ srcw = 0; ++ } ++ ++ inst = emit_x86_instruction(compiler, 2, dst_reg, 0, src, srcw); ++ FAIL_IF(!inst); ++ *inst++ = GROUP_0F; ++ *inst = get_jump_code(type & 0xff) - 0x40; ++ return SLJIT_SUCCESS; ++} -- cgit