summaryrefslogtreecommitdiffstats
path: root/pcre838.patch
diff options
context:
space:
mode:
Diffstat (limited to 'pcre838.patch')
-rw-r--r--pcre838.patch6665
1 files changed, 6665 insertions, 0 deletions
diff --git a/pcre838.patch b/pcre838.patch
new file mode 100644
index 0000000..3b3dcdf
--- /dev/null
+++ b/pcre838.patch
@@ -0,0 +1,6665 @@
+Backported from 5.5 for 5.4 by Remi Collet
+
+
+diff -ru php-5.4.45/ext/pcre/pcrelib/config.h php55/php-5.5.31/ext/pcre/pcrelib/config.h
+--- php-5.4.45/ext/pcre/pcrelib/config.h 2015-09-01 22:09:37.000000000 +0200
++++ php-5.5.31/ext/pcre/pcrelib/config.h 2016-01-06 10:36:49.000000000 +0100
+@@ -302,6 +302,8 @@
+ */
+ /* #undef NO_RECURSE */
+
++#define PARENS_NEST_LIMIT 250
++
+ /* Name of package */
+ #define PACKAGE "pcre"
+
+diff -ru php54/php-5.4.45/ext/pcre/pcrelib/pcre_exec.c php55/php-5.5.31/ext/pcre/pcrelib/pcre_exec.c
+--- php-5.4.45/ext/pcre/pcrelib/pcre_exec.c 2015-09-01 22:09:37.000000000 +0200
++++ php-5.5.31/ext/pcre/pcrelib/pcre_exec.c 2016-01-06 10:36:49.000000000 +0100
+@@ -688,7 +688,7 @@
+ #define foc number
+ #define save_mark data
+
+-/* These statements are here to stop the compiler complaining about unitialized
++/* These statements are here to stop the compiler complaining about uninitialized
+ variables. */
+
+ #ifdef SUPPORT_UCP
+
+From ca02d9c2d6f9bea7bf8abe607f1ee9484b1d7b62 Mon Sep 17 00:00:00 2001
+From: Stanislav Malyshev <stas@php.net>
+Date: Sun, 31 Jan 2016 20:33:17 -0800
+Subject: [PATCH] Upgrade bundled PCRE to 8.38
+
+---
+ NEWS | 3 +
+ ext/pcre/pcrelib/ChangeLog | 176 ++
+ ext/pcre/pcrelib/NEWS | 8 +
+ ext/pcre/pcrelib/config.h | 11 +-
+ ext/pcre/pcrelib/doc/pcre.txt | 2130 +++++++++++-----------
+ ext/pcre/pcrelib/pcre.h | 4 +-
+ ext/pcre/pcrelib/pcre_compile.c | 334 +++-
+ ext/pcre/pcrelib/pcre_exec.c | 5 +-
+ ext/pcre/pcrelib/pcre_internal.h | 17 +-
+ ext/pcre/pcrelib/pcre_jit_compile.c | 77 +-
+ ext/pcre/pcrelib/pcre_study.c | 19 +-
+ ext/pcre/pcrelib/pcre_xclass.c | 2 +-
+ ext/pcre/pcrelib/sljit/sljitConfig.h | 9 +
+ ext/pcre/pcrelib/sljit/sljitConfigInternal.h | 13 +-
+ ext/pcre/pcrelib/sljit/sljitLir.c | 10 +-
+ ext/pcre/pcrelib/sljit/sljitLir.h | 128 +-
+ ext/pcre/pcrelib/sljit/sljitNativeARM_32.c | 27 +-
+ ext/pcre/pcrelib/sljit/sljitNativeARM_64.c | 48 +-
+ ext/pcre/pcrelib/sljit/sljitNativeARM_T2_32.c | 58 +-
+ ext/pcre/pcrelib/sljit/sljitNativeMIPS_common.c | 15 +-
+ ext/pcre/pcrelib/sljit/sljitNativePPC_common.c | 23 +-
+ ext/pcre/pcrelib/sljit/sljitNativeSPARC_common.c | 19 +-
+ ext/pcre/pcrelib/sljit/sljitNativeTILEGX_64.c | 311 ++--
+ ext/pcre/pcrelib/sljit/sljitNativeX86_common.c | 129 +-
+ ext/pcre/pcrelib/testdata/grepoutput | 12 +
+ ext/pcre/pcrelib/testdata/testinput1 | 13 +
+ ext/pcre/pcrelib/testdata/testinput11 | 4 +
+ ext/pcre/pcrelib/testdata/testinput12 | 17 +
+ ext/pcre/pcrelib/testdata/testinput14 | 2 +
+ ext/pcre/pcrelib/testdata/testinput17 | 2 +
+ ext/pcre/pcrelib/testdata/testinput2 | 139 ++
+ ext/pcre/pcrelib/testdata/testinput4 | 5 +
+ ext/pcre/pcrelib/testdata/testinput5 | 8 +
+ ext/pcre/pcrelib/testdata/testinput6 | 57 +
+ ext/pcre/pcrelib/testdata/testinput7 | 15 +
+ ext/pcre/pcrelib/testdata/testinput8 | 4 +
+ ext/pcre/pcrelib/testdata/testinputEBC | 3 +
+ ext/pcre/pcrelib/testdata/testoutput1 | 23 +
+ ext/pcre/pcrelib/testdata/testoutput11-16 | 50 +-
+ ext/pcre/pcrelib/testdata/testoutput11-32 | 50 +-
+ ext/pcre/pcrelib/testdata/testoutput11-8 | 50 +-
+ ext/pcre/pcrelib/testdata/testoutput12 | 25 +
+ ext/pcre/pcrelib/testdata/testoutput14 | 2 +
+ ext/pcre/pcrelib/testdata/testoutput17 | 2 +
+ ext/pcre/pcrelib/testdata/testoutput2 | 380 +++-
+ ext/pcre/pcrelib/testdata/testoutput4 | 6 +
+ ext/pcre/pcrelib/testdata/testoutput5 | 45 +
+ ext/pcre/pcrelib/testdata/testoutput6 | 96 +
+ ext/pcre/pcrelib/testdata/testoutput7 | 57 +-
+ ext/pcre/pcrelib/testdata/testoutput8 | 6 +
+ ext/pcre/pcrelib/testdata/testoutputEBC | 6 +
+ 51 files changed, 3144 insertions(+), 1511 deletions(-)
+
+diff --git a/ext/pcre/pcrelib/ChangeLog b/ext/pcre/pcrelib/ChangeLog
+index 359b412..5e5bf18 100644
+--- a/ext/pcre/pcrelib/ChangeLog
++++ b/ext/pcre/pcrelib/ChangeLog
+@@ -1,6 +1,182 @@
+ ChangeLog for PCRE
+ ------------------
+
++Note that the PCRE 8.xx series (PCRE1) is now in a bugfix-only state. All
++development is happening in the PCRE2 10.xx series.
++
++Version 8.38 23-November-2015
++-----------------------------
++
++1. If a group that contained a recursive back reference also contained a
++ forward reference subroutine call followed by a non-forward-reference
++ subroutine call, for example /.((?2)(?R)\1)()/, pcre_compile() failed to
++ compile correct code, leading to undefined behaviour or an internally
++ detected error. This bug was discovered by the LLVM fuzzer.
++
++2. Quantification of certain items (e.g. atomic back references) could cause
++ incorrect code to be compiled when recursive forward references were
++ involved. For example, in this pattern: /(?1)()((((((\1++))\x85)+)|))/.
++ This bug was discovered by the LLVM fuzzer.
++
++3. A repeated conditional group whose condition was a reference by name caused
++ a buffer overflow if there was more than one group with the given name.
++ This bug was discovered by the LLVM fuzzer.
++
++4. A recursive back reference by name within a group that had the same name as
++ another group caused a buffer overflow. For example:
++ /(?J)(?'d'(?'d'\g{d}))/. This bug was discovered by the LLVM fuzzer.
++
++5. A forward reference by name to a group whose number is the same as the
++ current group, for example in this pattern: /(?|(\k'Pm')|(?'Pm'))/, caused
++ a buffer overflow at compile time. This bug was discovered by the LLVM
++ fuzzer.
++
++6. A lookbehind assertion within a set of mutually recursive subpatterns could
++ provoke a buffer overflow. This bug was discovered by the LLVM fuzzer.
++
++7. Another buffer overflow bug involved duplicate named groups with a
++ reference between their definition, with a group that reset capture
++ numbers, for example: /(?J:(?|(?'R')(\k'R')|((?'R'))))/. This has been
++ fixed by always allowing for more memory, even if not needed. (A proper fix
++ is implemented in PCRE2, but it involves more refactoring.)
++
++8. There was no check for integer overflow in subroutine calls such as (?123).
++
++9. The table entry for \l in EBCDIC environments was incorrect, leading to its
++ being treated as a literal 'l' instead of causing an error.
++
++10. There was a buffer overflow if pcre_exec() was called with an ovector of
++ size 1. This bug was found by american fuzzy lop.
++
++11. If a non-capturing group containing a conditional group that could match
++ an empty string was repeated, it was not identified as matching an empty
++ string itself. For example: /^(?:(?(1)x|)+)+$()/.
++
++12. In an EBCDIC environment, pcretest was mishandling the escape sequences
++ \a and \e in test subject lines.
++
++13. In an EBCDIC environment, \a in a pattern was converted to the ASCII
++ instead of the EBCDIC value.
++
++14. The handling of \c in an EBCDIC environment has been revised so that it is
++ now compatible with the specification in Perl's perlebcdic page.
++
++15. The EBCDIC character 0x41 is a non-breaking space, equivalent to 0xa0 in
++ ASCII/Unicode. This has now been added to the list of characters that are
++ recognized as white space in EBCDIC.
++
++16. When PCRE was compiled without UCP support, the use of \p and \P gave an
++ error (correctly) when used outside a class, but did not give an error
++ within a class.
++
++17. \h within a class was incorrectly compiled in EBCDIC environments.
++
++18. A pattern with an unmatched closing parenthesis that contained a backward
++ assertion which itself contained a forward reference caused buffer
++ overflow. An example pattern is: /(?=di(?<=(?1))|(?=(.))))/.
++
++19. JIT should return with error when the compiled pattern requires more stack
++ space than the maximum.
++
++20. A possessively repeated conditional group that could match an empty string,
++ for example, /(?(R))*+/, was incorrectly compiled.
++
++21. Fix infinite recursion in the JIT compiler when certain patterns such as
++ /(?:|a|){100}x/ are analysed.
++
++22. Some patterns with character classes involving [: and \\ were incorrectly
++ compiled and could cause reading from uninitialized memory or an incorrect
++ error diagnosis.
++
++23. Pathological patterns containing many nested occurrences of [: caused
++ pcre_compile() to run for a very long time.
++
++24. A conditional group with only one branch has an implicit empty alternative
++ branch and must therefore be treated as potentially matching an empty
++ string.
++
++25. If (?R was followed by - or + incorrect behaviour happened instead of a
++ diagnostic.
++
++26. Arrange to give up on finding the minimum matching length for overly
++ complex patterns.
++
++27. Similar to (4) above: in a pattern with duplicated named groups and an
++ occurrence of (?| it is possible for an apparently non-recursive back
++ reference to become recursive if a later named group with the relevant
++ number is encountered. This could lead to a buffer overflow. Wen Guanxing
++ from Venustech ADLAB discovered this bug.
++
++28. If pcregrep was given the -q option with -c or -l, or when handling a
++ binary file, it incorrectly wrote output to stdout.
++
++29. The JIT compiler did not restore the control verb head in case of *THEN
++ control verbs. This issue was found by Karl Skomski with a custom LLVM
++ fuzzer.
++
++30. Error messages for syntax errors following \g and \k were giving inaccurate
++ offsets in the pattern.
++
++31. Added a check for integer overflow in conditions (?(<digits>) and
++ (?(R<digits>). This omission was discovered by Karl Skomski with the LLVM
++ fuzzer.
++
++32. Handling recursive references such as (?2) when the reference is to a group
++ later in the pattern uses code that is very hacked about and error-prone.
++ It has been re-written for PCRE2. Here in PCRE1, a check has been added to
++ give an internal error if it is obvious that compiling has gone wrong.
++
++33. The JIT compiler should not check repeats after a {0,1} repeat byte code.
++ This issue was found by Karl Skomski with a custom LLVM fuzzer.
++
++34. The JIT compiler should restore the control chain for empty possessive
++ repeats. This issue was found by Karl Skomski with a custom LLVM fuzzer.
++
++35. Match limit check added to JIT recursion. This issue was found by Karl
++ Skomski with a custom LLVM fuzzer.
++
++36. Yet another case similar to 27 above has been circumvented by an
++ unconditional allocation of extra memory. This issue is fixed "properly" in
++ PCRE2 by refactoring the way references are handled. Wen Guanxing
++ from Venustech ADLAB discovered this bug.
++
++37. Fix two assertion fails in JIT. These issues were found by Karl Skomski
++ with a custom LLVM fuzzer.
++
++38. Fixed a corner case of range optimization in JIT.
++
++39. An incorrect error "overran compiling workspace" was given if there were
++ exactly enough group forward references such that the last one extended
++ into the workspace safety margin. The next one would have expanded the
++ workspace. The test for overflow was not including the safety margin.
++
++40. A match limit issue is fixed in JIT which was found by Karl Skomski
++ with a custom LLVM fuzzer.
++
++41. Remove the use of /dev/null in testdata/testinput2, because it doesn't
++ work under Windows. (Why has it taken so long for anyone to notice?)
++
++42. In a character class such as [\W\p{Any}] where both a negative-type escape
++ ("not a word character") and a property escape were present, the property
++ escape was being ignored.
++
++43. Fix crash caused by very long (*MARK) or (*THEN) names.
++
++44. A sequence such as [[:punct:]b] that is, a POSIX character class followed
++ by a single ASCII character in a class item, was incorrectly compiled in
++ UCP mode. The POSIX class got lost, but only if the single character
++ followed it.
++
++45. [:punct:] in UCP mode was matching some characters in the range 128-255
++ that should not have been matched.
++
++46. If [:^ascii:] or [:^xdigit:] or [:^cntrl:] are present in a non-negated
++ class, all characters with code points greater than 255 are in the class.
++ When a Unicode property was also in the class (if PCRE_UCP is set, escapes
++ such as \w are turned into Unicode properties), wide characters were not
++ correctly handled, and could fail to match.
++
++
+ Version 8.37 28-April-2015
+ --------------------------
+
+diff --git a/ext/pcre/pcrelib/NEWS b/ext/pcre/pcrelib/NEWS
+index 064bf27..7e42dcb 100644
+--- a/ext/pcre/pcrelib/NEWS
++++ b/ext/pcre/pcrelib/NEWS
+@@ -1,6 +1,14 @@
+ News about PCRE releases
+ ------------------------
+
++Release 8.38 23-November-2015
++-----------------------------
++
++This is a bug-fix release. Note that this library (now called PCRE1) is now being
++maintained for bug fixes only. New projects are advised to use the new PCRE2
++libraries.
++
++
+ Release 8.37 28-April-2015
+ --------------------------
+
+diff --git a/ext/pcre/pcrelib/config.h b/ext/pcre/pcrelib/config.h
+index ba06a17..0f7a9f7 100644
+--- a/ext/pcre/pcrelib/config.h
++++ b/ext/pcre/pcrelib/config.h
+@@ -234,8 +234,8 @@ them both to 0; an emulation function will be used. */
+ #define LINK_SIZE 2
+ #endif
+
+-/* Define to the sub-directory in which libtool stores uninstalled libraries.
+- */
++/* Define to the sub-directory where libtool stores uninstalled libraries. */
++/* This is ignored unless you are using libtool. */
+ #ifndef LT_OBJDIR
+ #define LT_OBJDIR ".libs/"
+ #endif
+@@ -314,7 +314,7 @@ them both to 0; an emulation function will be used. */
+ #define PACKAGE_NAME "PCRE"
+
+ /* Define to the full name and version of this package. */
+-#define PACKAGE_STRING "PCRE 8.37"
++#define PACKAGE_STRING "PCRE 8.38"
+
+ /* Define to the one symbol short name of this package. */
+ #define PACKAGE_TARNAME "pcre"
+@@ -323,7 +323,7 @@ them both to 0; an emulation function will be used. */
+ #define PACKAGE_URL ""
+
+ /* Define to the version of this package. */
+-#define PACKAGE_VERSION "8.37"
++#define PACKAGE_VERSION "8.38"
+
+ /* to make a symbol visible */
+ /* #undef PCRECPP_EXP_DECL */
+@@ -439,7 +439,7 @@ them both to 0; an emulation function will be used. */
+
+ /* Version number of package */
+ #ifndef VERSION
+-#define VERSION "8.37"
++#define VERSION "8.38"
+ #endif
+
+ /* Define to empty if `const' does not conform to ANSI C. */
+@@ -451,4 +451,3 @@ them both to 0; an emulation function will be used. */
+
+ /* Define to `unsigned int' if <sys/types.h> does not define. */
+ /* #undef size_t */
+-
+diff --git a/ext/pcre/pcrelib/doc/pcre.txt b/ext/pcre/pcrelib/doc/pcre.txt
+index ce27f4b..76a47c7 100644
+--- a/ext/pcre/pcrelib/doc/pcre.txt
++++ b/ext/pcre/pcrelib/doc/pcre.txt
+@@ -13,7 +13,18 @@ PCRE(3) Library Functions Manual PCRE(3)
+
+
+ NAME
+- PCRE - Perl-compatible regular expressions
++ PCRE - Perl-compatible regular expressions (original API)
++
++PLEASE TAKE NOTE
++
++ This document relates to PCRE releases that use the original API, with
++ library names libpcre, libpcre16, and libpcre32. January 2015 saw the
++ first release of a new API, known as PCRE2, with release numbers start-
++ ing at 10.00 and library names libpcre2-8, libpcre2-16, and
++ libpcre2-32. The old libraries (now called PCRE1) are still being main-
++ tained for bug fixes, but there will be no new development. New
++ projects are advised to use the new PCRE2 libraries.
++
+
+ INTRODUCTION
+
+@@ -179,8 +190,8 @@ AUTHOR
+
+ REVISION
+
+- Last updated: 08 January 2014
+- Copyright (c) 1997-2014 University of Cambridge.
++ Last updated: 10 February 2015
++ Copyright (c) 1997-2015 University of Cambridge.
+ ------------------------------------------------------------------------------
+
+
+@@ -4989,7 +5000,8 @@ BACKSLASH
+ appearance of non-printing characters, apart from the binary zero that
+ terminates a pattern, but when a pattern is being prepared by text
+ editing, it is often easier to use one of the following escape
+- sequences than the binary character it represents:
++ sequences than the binary character it represents. In an ASCII or Uni-
++ code environment, these escapes are as follows:
+
+ \a alarm, that is, the BEL character (hex 07)
+ \cx "control-x", where x is any ASCII character
+@@ -5005,55 +5017,67 @@ BACKSLASH
+ \x{hhh..} character with hex code hhh.. (non-JavaScript mode)
+ \uhhhh character with hex code hhhh (JavaScript mode only)
+
+- The precise effect of \cx on ASCII characters is as follows: if x is a
+- lower case letter, it is converted to upper case. Then bit 6 of the
++ The precise effect of \cx on ASCII characters is as follows: if x is a
++ lower case letter, it is converted to upper case. Then bit 6 of the
+ character (hex 40) is inverted. Thus \cA to \cZ become hex 01 to hex 1A
+- (A is 41, Z is 5A), but \c{ becomes hex 3B ({ is 7B), and \c; becomes
+- hex 7B (; is 3B). If the data item (byte or 16-bit value) following \c
+- has a value greater than 127, a compile-time error occurs. This locks
++ (A is 41, Z is 5A), but \c{ becomes hex 3B ({ is 7B), and \c; becomes
++ hex 7B (; is 3B). If the data item (byte or 16-bit value) following \c
++ has a value greater than 127, a compile-time error occurs. This locks
+ out non-ASCII characters in all modes.
+
+- The \c facility was designed for use with ASCII characters, but with
+- the extension to Unicode it is even less useful than it once was. It
+- is, however, recognized when PCRE is compiled in EBCDIC mode, where
+- data items are always bytes. In this mode, all values are valid after
+- \c. If the next character is a lower case letter, it is converted to
+- upper case. Then the 0xc0 bits of the byte are inverted. Thus \cA
+- becomes hex 01, as in ASCII (A is C1), but because the EBCDIC letters
+- are disjoint, \cZ becomes hex 29 (Z is E9), and other characters also
+- generate different values.
+-
+- After \0 up to two further octal digits are read. If there are fewer
+- than two digits, just those that are present are used. Thus the
+- sequence \0\x\07 specifies two binary zeros followed by a BEL character
+- (code value 7). Make sure you supply two digits after the initial zero
++ When PCRE is compiled in EBCDIC mode, \a, \e, \f, \n, \r, and \t gener-
++ ate the appropriate EBCDIC code values. The \c escape is processed as
++ specified for Perl in the perlebcdic document. The only characters that
++ are allowed after \c are A-Z, a-z, or one of @, [, \, ], ^, _, or ?.
++ Any other character provokes a compile-time error. The sequence \c@
++ encodes character code 0; the letters (in either case) encode charac-
++ ters 1-26 (hex 01 to hex 1A); [, \, ], ^, and _ encode characters 27-31
++ (hex 1B to hex 1F), and \c? becomes either 255 (hex FF) or 95 (hex 5F).
++
++ Thus, apart from \c?, these escapes generate the same character code
++ values as they do in an ASCII environment, though the meanings of the
++ values mostly differ. For example, \cG always generates code value 7,
++ which is BEL in ASCII but DEL in EBCDIC.
++
++ The sequence \c? generates DEL (127, hex 7F) in an ASCII environment,
++ but because 127 is not a control character in EBCDIC, Perl makes it
++ generate the APC character. Unfortunately, there are several variants
++ of EBCDIC. In most of them the APC character has the value 255 (hex
++ FF), but in the one Perl calls POSIX-BC its value is 95 (hex 5F). If
++ certain other characters have POSIX-BC values, PCRE makes \c? generate
++ 95; otherwise it generates 255.
++
++ After \0 up to two further octal digits are read. If there are fewer
++ than two digits, just those that are present are used. Thus the
++ sequence \0\x\015 specifies two binary zeros followed by a CR character
++ (code value 13). Make sure you supply two digits after the initial zero
+ if the pattern character that follows is itself an octal digit.
+
+- The escape \o must be followed by a sequence of octal digits, enclosed
+- in braces. An error occurs if this is not the case. This escape is a
+- recent addition to Perl; it provides way of specifying character code
+- points as octal numbers greater than 0777, and it also allows octal
++ The escape \o must be followed by a sequence of octal digits, enclosed
++ in braces. An error occurs if this is not the case. This escape is a
++ recent addition to Perl; it provides a way of specifying character code
++ points as octal numbers greater than 0777, and it also allows octal
+ numbers and back references to be unambiguously specified.
+
+ For greater clarity and unambiguity, it is best to avoid following \ by
+ a digit greater than zero. Instead, use \o{} or \x{} to specify charac-
+- ter numbers, and \g{} to specify back references. The following para-
++ ter numbers, and \g{} to specify back references. The following para-
+ graphs describe the old, ambiguous syntax.
+
+ The handling of a backslash followed by a digit other than 0 is compli-
+- cated, and Perl has changed in recent releases, causing PCRE also to
++ cated, and Perl has changed in recent releases, causing PCRE also to
+ change. Outside a character class, PCRE reads the digit and any follow-
+- ing digits as a decimal number. If the number is less than 8, or if
+- there have been at least that many previous capturing left parentheses
+- in the expression, the entire sequence is taken as a back reference. A
+- description of how this works is given later, following the discussion
++ ing digits as a decimal number. If the number is less than 8, or if
++ there have been at least that many previous capturing left parentheses
++ in the expression, the entire sequence is taken as a back reference. A
++ description of how this works is given later, following the discussion
+ of parenthesized subpatterns.
+
+- Inside a character class, or if the decimal number following \ is
++ Inside a character class, or if the decimal number following \ is
+ greater than 7 and there have not been that many capturing subpatterns,
+- PCRE handles \8 and \9 as the literal characters "8" and "9", and oth-
++ PCRE handles \8 and \9 as the literal characters "8" and "9", and oth-
+ erwise re-reads up to three octal digits following the backslash, using
+- them to generate a data character. Any subsequent digits stand for
++ them to generate a data character. Any subsequent digits stand for
+ themselves. For example:
+
+ \040 is another way of writing an ASCII space
+@@ -5071,31 +5095,31 @@ BACKSLASH
+ \81 is either a back reference, or the two
+ characters "8" and "1"
+
+- Note that octal values of 100 or greater that are specified using this
+- syntax must not be introduced by a leading zero, because no more than
++ Note that octal values of 100 or greater that are specified using this
++ syntax must not be introduced by a leading zero, because no more than
+ three octal digits are ever read.
+
+- By default, after \x that is not followed by {, from zero to two hexa-
+- decimal digits are read (letters can be in upper or lower case). Any
++ By default, after \x that is not followed by {, from zero to two hexa-
++ decimal digits are read (letters can be in upper or lower case). Any
+ number of hexadecimal digits may appear between \x{ and }. If a charac-
+- ter other than a hexadecimal digit appears between \x{ and }, or if
++ ter other than a hexadecimal digit appears between \x{ and }, or if
+ there is no terminating }, an error occurs.
+
+- If the PCRE_JAVASCRIPT_COMPAT option is set, the interpretation of \x
+- is as just described only when it is followed by two hexadecimal dig-
+- its. Otherwise, it matches a literal "x" character. In JavaScript
++ If the PCRE_JAVASCRIPT_COMPAT option is set, the interpretation of \x
++ is as just described only when it is followed by two hexadecimal dig-
++ its. Otherwise, it matches a literal "x" character. In JavaScript
+ mode, support for code points greater than 256 is provided by \u, which
+- must be followed by four hexadecimal digits; otherwise it matches a
++ must be followed by four hexadecimal digits; otherwise it matches a
+ literal "u" character.
+
+ Characters whose value is less than 256 can be defined by either of the
+- two syntaxes for \x (or by \u in JavaScript mode). There is no differ-
++ two syntaxes for \x (or by \u in JavaScript mode). There is no differ-
+ ence in the way they are handled. For example, \xdc is exactly the same
+ as \x{dc} (or \u00dc in JavaScript mode).
+
+ Constraints on character values
+
+- Characters that are specified using octal or hexadecimal numbers are
++ Characters that are specified using octal or hexadecimal numbers are
+ limited to certain values, as follows:
+
+ 8-bit non-UTF mode less than 0x100
+@@ -5105,44 +5129,44 @@ BACKSLASH
+ 32-bit non-UTF mode less than 0x100000000
+ 32-bit UTF-32 mode less than 0x10ffff and a valid codepoint
+
+- Invalid Unicode codepoints are the range 0xd800 to 0xdfff (the so-
++ Invalid Unicode codepoints are the range 0xd800 to 0xdfff (the so-
+ called "surrogate" codepoints), and 0xffef.
+
+ Escape sequences in character classes
+
+ All the sequences that define a single character value can be used both
+- inside and outside character classes. In addition, inside a character
++ inside and outside character classes. In addition, inside a character
+ class, \b is interpreted as the backspace character (hex 08).
+
+- \N is not allowed in a character class. \B, \R, and \X are not special
+- inside a character class. Like other unrecognized escape sequences,
+- they are treated as the literal characters "B", "R", and "X" by
+- default, but cause an error if the PCRE_EXTRA option is set. Outside a
++ \N is not allowed in a character class. \B, \R, and \X are not special
++ inside a character class. Like other unrecognized escape sequences,
++ they are treated as the literal characters "B", "R", and "X" by
++ default, but cause an error if the PCRE_EXTRA option is set. Outside a
+ character class, these sequences have different meanings.
+
+ Unsupported escape sequences
+
+- In Perl, the sequences \l, \L, \u, and \U are recognized by its string
+- handler and used to modify the case of following characters. By
+- default, PCRE does not support these escape sequences. However, if the
+- PCRE_JAVASCRIPT_COMPAT option is set, \U matches a "U" character, and
++ In Perl, the sequences \l, \L, \u, and \U are recognized by its string
++ handler and used to modify the case of following characters. By
++ default, PCRE does not support these escape sequences. However, if the
++ PCRE_JAVASCRIPT_COMPAT option is set, \U matches a "U" character, and
+ \u can be used to define a character by code point, as described in the
+ previous section.
+
+ Absolute and relative back references
+
+- The sequence \g followed by an unsigned or a negative number, option-
+- ally enclosed in braces, is an absolute or relative back reference. A
++ The sequence \g followed by an unsigned or a negative number, option-
++ ally enclosed in braces, is an absolute or relative back reference. A
+ named back reference can be coded as \g{name}. Back references are dis-
+ cussed later, following the discussion of parenthesized subpatterns.
+
+ Absolute and relative subroutine calls
+
+- For compatibility with Oniguruma, the non-Perl syntax \g followed by a
++ For compatibility with Oniguruma, the non-Perl syntax \g followed by a
+ name or a number enclosed either in angle brackets or single quotes, is
+- an alternative syntax for referencing a subpattern as a "subroutine".
+- Details are discussed later. Note that \g{...} (Perl syntax) and
+- \g<...> (Oniguruma syntax) are not synonymous. The former is a back
++ an alternative syntax for referencing a subpattern as a "subroutine".
++ Details are discussed later. Note that \g{...} (Perl syntax) and
++ \g<...> (Oniguruma syntax) are not synonymous. The former is a back
+ reference; the latter is a subroutine call.
+
+ Generic character types
+@@ -5161,59 +5185,59 @@ BACKSLASH
+ \W any "non-word" character
+
+ There is also the single sequence \N, which matches a non-newline char-
+- acter. This is the same as the "." metacharacter when PCRE_DOTALL is
+- not set. Perl also uses \N to match characters by name; PCRE does not
++ acter. This is the same as the "." metacharacter when PCRE_DOTALL is
++ not set. Perl also uses \N to match characters by name; PCRE does not
+ support this.
+
+- Each pair of lower and upper case escape sequences partitions the com-
+- plete set of characters into two disjoint sets. Any given character
+- matches one, and only one, of each pair. The sequences can appear both
+- inside and outside character classes. They each match one character of
+- the appropriate type. If the current matching point is at the end of
+- the subject string, all of them fail, because there is no character to
++ Each pair of lower and upper case escape sequences partitions the com-
++ plete set of characters into two disjoint sets. Any given character
++ matches one, and only one, of each pair. The sequences can appear both
++ inside and outside character classes. They each match one character of
++ the appropriate type. If the current matching point is at the end of
++ the subject string, all of them fail, because there is no character to
+ match.
+
+- For compatibility with Perl, \s did not used to match the VT character
+- (code 11), which made it different from the the POSIX "space" class.
+- However, Perl added VT at release 5.18, and PCRE followed suit at
+- release 8.34. The default \s characters are now HT (9), LF (10), VT
+- (11), FF (12), CR (13), and space (32), which are defined as white
++ For compatibility with Perl, \s did not used to match the VT character
++ (code 11), which made it different from the POSIX "space" class.
++ However, Perl added VT at release 5.18, and PCRE followed suit at
++ release 8.34. The default \s characters are now HT (9), LF (10), VT
++ (11), FF (12), CR (13), and space (32), which are defined as white
+ space in the "C" locale. This list may vary if locale-specific matching
+- is taking place. For example, in some locales the "non-breaking space"
+- character (\xA0) is recognized as white space, and in others the VT
++ is taking place. For example, in some locales the "non-breaking space"
++ character (\xA0) is recognized as white space, and in others the VT
+ character is not.
+
+- A "word" character is an underscore or any character that is a letter
+- or digit. By default, the definition of letters and digits is con-
+- trolled by PCRE's low-valued character tables, and may vary if locale-
+- specific matching is taking place (see "Locale support" in the pcreapi
+- page). For example, in a French locale such as "fr_FR" in Unix-like
+- systems, or "french" in Windows, some character codes greater than 127
+- are used for accented letters, and these are then matched by \w. The
++ A "word" character is an underscore or any character that is a letter
++ or digit. By default, the definition of letters and digits is con-
++ trolled by PCRE's low-valued character tables, and may vary if locale-
++ specific matching is taking place (see "Locale support" in the pcreapi
++ page). For example, in a French locale such as "fr_FR" in Unix-like
++ systems, or "french" in Windows, some character codes greater than 127
++ are used for accented letters, and these are then matched by \w. The
+ use of locales with Unicode is discouraged.
+
+- By default, characters whose code points are greater than 127 never
++ By default, characters whose code points are greater than 127 never
+ match \d, \s, or \w, and always match \D, \S, and \W, although this may
+- vary for characters in the range 128-255 when locale-specific matching
+- is happening. These escape sequences retain their original meanings
+- from before Unicode support was available, mainly for efficiency rea-
+- sons. If PCRE is compiled with Unicode property support, and the
+- PCRE_UCP option is set, the behaviour is changed so that Unicode prop-
++ vary for characters in the range 128-255 when locale-specific matching
++ is happening. These escape sequences retain their original meanings
++ from before Unicode support was available, mainly for efficiency rea-
++ sons. If PCRE is compiled with Unicode property support, and the
++ PCRE_UCP option is set, the behaviour is changed so that Unicode prop-
+ erties are used to determine character types, as follows:
+
+ \d any character that matches \p{Nd} (decimal digit)
+ \s any character that matches \p{Z} or \h or \v
+ \w any character that matches \p{L} or \p{N}, plus underscore
+
+- The upper case escapes match the inverse sets of characters. Note that
+- \d matches only decimal digits, whereas \w matches any Unicode digit,
+- as well as any Unicode letter, and underscore. Note also that PCRE_UCP
+- affects \b, and \B because they are defined in terms of \w and \W.
++ The upper case escapes match the inverse sets of characters. Note that
++ \d matches only decimal digits, whereas \w matches any Unicode digit,
++ as well as any Unicode letter, and underscore. Note also that PCRE_UCP
++ affects \b, and \B because they are defined in terms of \w and \W.
+ Matching these sequences is noticeably slower when PCRE_UCP is set.
+
+- The sequences \h, \H, \v, and \V are features that were added to Perl
+- at release 5.10. In contrast to the other sequences, which match only
+- ASCII characters by default, these always match certain high-valued
++ The sequences \h, \H, \v, and \V are features that were added to Perl
++ at release 5.10. In contrast to the other sequences, which match only
++ ASCII characters by default, these always match certain high-valued
+ code points, whether or not PCRE_UCP is set. The horizontal space char-
+ acters are:
+
+@@ -5252,110 +5276,110 @@ BACKSLASH
+
+ Newline sequences
+
+- Outside a character class, by default, the escape sequence \R matches
+- any Unicode newline sequence. In 8-bit non-UTF-8 mode \R is equivalent
++ Outside a character class, by default, the escape sequence \R matches
++ any Unicode newline sequence. In 8-bit non-UTF-8 mode \R is equivalent
+ to the following:
+
+ (?>\r\n|\n|\x0b|\f|\r|\x85)
+
+- This is an example of an "atomic group", details of which are given
++ This is an example of an "atomic group", details of which are given
+ below. This particular group matches either the two-character sequence
+- CR followed by LF, or one of the single characters LF (linefeed,
+- U+000A), VT (vertical tab, U+000B), FF (form feed, U+000C), CR (car-
+- riage return, U+000D), or NEL (next line, U+0085). The two-character
++ CR followed by LF, or one of the single characters LF (linefeed,
++ U+000A), VT (vertical tab, U+000B), FF (form feed, U+000C), CR (car-
++ riage return, U+000D), or NEL (next line, U+0085). The two-character
+ sequence is treated as a single unit that cannot be split.
+
+- In other modes, two additional characters whose codepoints are greater
++ In other modes, two additional characters whose codepoints are greater
+ than 255 are added: LS (line separator, U+2028) and PS (paragraph sepa-
+- rator, U+2029). Unicode character property support is not needed for
++ rator, U+2029). Unicode character property support is not needed for
+ these characters to be recognized.
+
+ It is possible to restrict \R to match only CR, LF, or CRLF (instead of
+- the complete set of Unicode line endings) by setting the option
++ the complete set of Unicode line endings) by setting the option
+ PCRE_BSR_ANYCRLF either at compile time or when the pattern is matched.
+ (BSR is an abbrevation for "backslash R".) This can be made the default
+- when PCRE is built; if this is the case, the other behaviour can be
+- requested via the PCRE_BSR_UNICODE option. It is also possible to
+- specify these settings by starting a pattern string with one of the
++ when PCRE is built; if this is the case, the other behaviour can be
++ requested via the PCRE_BSR_UNICODE option. It is also possible to
++ specify these settings by starting a pattern string with one of the
+ following sequences:
+
+ (*BSR_ANYCRLF) CR, LF, or CRLF only
+ (*BSR_UNICODE) any Unicode newline sequence
+
+ These override the default and the options given to the compiling func-
+- tion, but they can themselves be overridden by options given to a
+- matching function. Note that these special settings, which are not
+- Perl-compatible, are recognized only at the very start of a pattern,
+- and that they must be in upper case. If more than one of them is
+- present, the last one is used. They can be combined with a change of
++ tion, but they can themselves be overridden by options given to a
++ matching function. Note that these special settings, which are not
++ Perl-compatible, are recognized only at the very start of a pattern,
++ and that they must be in upper case. If more than one of them is
++ present, the last one is used. They can be combined with a change of
+ newline convention; for example, a pattern can start with:
+
+ (*ANY)(*BSR_ANYCRLF)
+
+- They can also be combined with the (*UTF8), (*UTF16), (*UTF32), (*UTF)
++ They can also be combined with the (*UTF8), (*UTF16), (*UTF32), (*UTF)
+ or (*UCP) special sequences. Inside a character class, \R is treated as
+- an unrecognized escape sequence, and so matches the letter "R" by
++ an unrecognized escape sequence, and so matches the letter "R" by
+ default, but causes an error if PCRE_EXTRA is set.
+
+ Unicode character properties
+
+ When PCRE is built with Unicode character property support, three addi-
+- tional escape sequences that match characters with specific properties
+- are available. When in 8-bit non-UTF-8 mode, these sequences are of
+- course limited to testing characters whose codepoints are less than
++ tional escape sequences that match characters with specific properties
++ are available. When in 8-bit non-UTF-8 mode, these sequences are of
++ course limited to testing characters whose codepoints are less than
+ 256, but they do work in this mode. The extra escape sequences are:
+
+ \p{xx} a character with the xx property
+ \P{xx} a character without the xx property
+ \X a Unicode extended grapheme cluster
+
+- The property names represented by xx above are limited to the Unicode
++ The property names represented by xx above are limited to the Unicode
+ script names, the general category properties, "Any", which matches any
+- character (including newline), and some special PCRE properties
+- (described in the next section). Other Perl properties such as "InMu-
+- sicalSymbols" are not currently supported by PCRE. Note that \P{Any}
++ character (including newline), and some special PCRE properties
++ (described in the next section). Other Perl properties such as "InMu-
++ sicalSymbols" are not currently supported by PCRE. Note that \P{Any}
+ does not match any characters, so always causes a match failure.
+
+ Sets of Unicode characters are defined as belonging to certain scripts.
+- A character from one of these sets can be matched using a script name.
++ A character from one of these sets can be matched using a script name.
+ For example:
+
+ \p{Greek}
+ \P{Han}
+
+- Those that are not part of an identified script are lumped together as
++ Those that are not part of an identified script are lumped together as
+ "Common". The current list of scripts is:
+
+- Arabic, Armenian, Avestan, Balinese, Bamum, Bassa_Vah, Batak, Bengali,
+- Bopomofo, Brahmi, Braille, Buginese, Buhid, Canadian_Aboriginal, Car-
++ Arabic, Armenian, Avestan, Balinese, Bamum, Bassa_Vah, Batak, Bengali,
++ Bopomofo, Brahmi, Braille, Buginese, Buhid, Canadian_Aboriginal, Car-
+ ian, Caucasian_Albanian, Chakma, Cham, Cherokee, Common, Coptic, Cunei-
+ form, Cypriot, Cyrillic, Deseret, Devanagari, Duployan, Egyptian_Hiero-
+ glyphs, Elbasan, Ethiopic, Georgian, Glagolitic, Gothic, Grantha,
+- Greek, Gujarati, Gurmukhi, Han, Hangul, Hanunoo, Hebrew, Hiragana,
+- Imperial_Aramaic, Inherited, Inscriptional_Pahlavi, Inscrip-
+- tional_Parthian, Javanese, Kaithi, Kannada, Katakana, Kayah_Li,
+- Kharoshthi, Khmer, Khojki, Khudawadi, Lao, Latin, Lepcha, Limbu, Lin-
+- ear_A, Linear_B, Lisu, Lycian, Lydian, Mahajani, Malayalam, Mandaic,
+- Manichaean, Meetei_Mayek, Mende_Kikakui, Meroitic_Cursive,
+- Meroitic_Hieroglyphs, Miao, Modi, Mongolian, Mro, Myanmar, Nabataean,
+- New_Tai_Lue, Nko, Ogham, Ol_Chiki, Old_Italic, Old_North_Arabian,
++ Greek, Gujarati, Gurmukhi, Han, Hangul, Hanunoo, Hebrew, Hiragana,
++ Imperial_Aramaic, Inherited, Inscriptional_Pahlavi, Inscrip-
++ tional_Parthian, Javanese, Kaithi, Kannada, Katakana, Kayah_Li,
++ Kharoshthi, Khmer, Khojki, Khudawadi, Lao, Latin, Lepcha, Limbu, Lin-
++ ear_A, Linear_B, Lisu, Lycian, Lydian, Mahajani, Malayalam, Mandaic,
++ Manichaean, Meetei_Mayek, Mende_Kikakui, Meroitic_Cursive,
++ Meroitic_Hieroglyphs, Miao, Modi, Mongolian, Mro, Myanmar, Nabataean,
++ New_Tai_Lue, Nko, Ogham, Ol_Chiki, Old_Italic, Old_North_Arabian,
+ Old_Permic, Old_Persian, Old_South_Arabian, Old_Turkic, Oriya, Osmanya,
+ Pahawh_Hmong, Palmyrene, Pau_Cin_Hau, Phags_Pa, Phoenician,
+- Psalter_Pahlavi, Rejang, Runic, Samaritan, Saurashtra, Sharada, Sha-
+- vian, Siddham, Sinhala, Sora_Sompeng, Sundanese, Syloti_Nagri, Syriac,
+- Tagalog, Tagbanwa, Tai_Le, Tai_Tham, Tai_Viet, Takri, Tamil, Telugu,
+- Thaana, Thai, Tibetan, Tifinagh, Tirhuta, Ugaritic, Vai, Warang_Citi,
++ Psalter_Pahlavi, Rejang, Runic, Samaritan, Saurashtra, Sharada, Sha-
++ vian, Siddham, Sinhala, Sora_Sompeng, Sundanese, Syloti_Nagri, Syriac,
++ Tagalog, Tagbanwa, Tai_Le, Tai_Tham, Tai_Viet, Takri, Tamil, Telugu,
++ Thaana, Thai, Tibetan, Tifinagh, Tirhuta, Ugaritic, Vai, Warang_Citi,
+ Yi.
+
+ Each character has exactly one Unicode general category property, spec-
+- ified by a two-letter abbreviation. For compatibility with Perl, nega-
+- tion can be specified by including a circumflex between the opening
+- brace and the property name. For example, \p{^Lu} is the same as
++ ified by a two-letter abbreviation. For compatibility with Perl, nega-
++ tion can be specified by including a circumflex between the opening
++ brace and the property name. For example, \p{^Lu} is the same as
+ \P{Lu}.
+
+ If only one letter is specified with \p or \P, it includes all the gen-
+- eral category properties that start with that letter. In this case, in
+- the absence of negation, the curly brackets in the escape sequence are
++ eral category properties that start with that letter. In this case, in
++ the absence of negation, the curly brackets in the escape sequence are
+ optional; these two examples have the same effect:
+
+ \p{L}
+@@ -5407,73 +5431,73 @@ BACKSLASH
+ Zp Paragraph separator
+ Zs Space separator
+
+- The special property L& is also supported: it matches a character that
+- has the Lu, Ll, or Lt property, in other words, a letter that is not
++ The special property L& is also supported: it matches a character that
++ has the Lu, Ll, or Lt property, in other words, a letter that is not
+ classified as a modifier or "other".
+
+- The Cs (Surrogate) property applies only to characters in the range
+- U+D800 to U+DFFF. Such characters are not valid in Unicode strings and
+- so cannot be tested by PCRE, unless UTF validity checking has been
++ The Cs (Surrogate) property applies only to characters in the range
++ U+D800 to U+DFFF. Such characters are not valid in Unicode strings and
++ so cannot be tested by PCRE, unless UTF validity checking has been
+ turned off (see the discussion of PCRE_NO_UTF8_CHECK,
+- PCRE_NO_UTF16_CHECK and PCRE_NO_UTF32_CHECK in the pcreapi page). Perl
++ PCRE_NO_UTF16_CHECK and PCRE_NO_UTF32_CHECK in the pcreapi page). Perl
+ does not support the Cs property.
+
+- The long synonyms for property names that Perl supports (such as
+- \p{Letter}) are not supported by PCRE, nor is it permitted to prefix
++ The long synonyms for property names that Perl supports (such as
++ \p{Letter}) are not supported by PCRE, nor is it permitted to prefix
+ any of these properties with "Is".
+
+ No character that is in the Unicode table has the Cn (unassigned) prop-
+ erty. Instead, this property is assumed for any code point that is not
+ in the Unicode table.
+
+- Specifying caseless matching does not affect these escape sequences.
+- For example, \p{Lu} always matches only upper case letters. This is
++ Specifying caseless matching does not affect these escape sequences.
++ For example, \p{Lu} always matches only upper case letters. This is
+ different from the behaviour of current versions of Perl.
+
+- Matching characters by Unicode property is not fast, because PCRE has
+- to do a multistage table lookup in order to find a character's prop-
++ Matching characters by Unicode property is not fast, because PCRE has
++ to do a multistage table lookup in order to find a character's prop-
+ erty. That is why the traditional escape sequences such as \d and \w do
+ not use Unicode properties in PCRE by default, though you can make them
+- do so by setting the PCRE_UCP option or by starting the pattern with
++ do so by setting the PCRE_UCP option or by starting the pattern with
+ (*UCP).
+
+ Extended grapheme clusters
+
+- The \X escape matches any number of Unicode characters that form an
++ The \X escape matches any number of Unicode characters that form an
+ "extended grapheme cluster", and treats the sequence as an atomic group
+- (see below). Up to and including release 8.31, PCRE matched an ear-
++ (see below). Up to and including release 8.31, PCRE matched an ear-
+ lier, simpler definition that was equivalent to
+
+ (?>\PM\pM*)
+
+- That is, it matched a character without the "mark" property, followed
+- by zero or more characters with the "mark" property. Characters with
+- the "mark" property are typically non-spacing accents that affect the
++ That is, it matched a character without the "mark" property, followed
++ by zero or more characters with the "mark" property. Characters with
++ the "mark" property are typically non-spacing accents that affect the
+ preceding character.
+
+- This simple definition was extended in Unicode to include more compli-
+- cated kinds of composite character by giving each character a grapheme
+- breaking property, and creating rules that use these properties to
+- define the boundaries of extended grapheme clusters. In releases of
++ This simple definition was extended in Unicode to include more compli-
++ cated kinds of composite character by giving each character a grapheme
++ breaking property, and creating rules that use these properties to
++ define the boundaries of extended grapheme clusters. In releases of
+ PCRE later than 8.31, \X matches one of these clusters.
+
+- \X always matches at least one character. Then it decides whether to
++ \X always matches at least one character. Then it decides whether to
+ add additional characters according to the following rules for ending a
+ cluster:
+
+ 1. End at the end of the subject string.
+
+- 2. Do not end between CR and LF; otherwise end after any control char-
++ 2. Do not end between CR and LF; otherwise end after any control char-
+ acter.
+
+- 3. Do not break Hangul (a Korean script) syllable sequences. Hangul
+- characters are of five types: L, V, T, LV, and LVT. An L character may
+- be followed by an L, V, LV, or LVT character; an LV or V character may
++ 3. Do not break Hangul (a Korean script) syllable sequences. Hangul
++ characters are of five types: L, V, T, LV, and LVT. An L character may
++ be followed by an L, V, LV, or LVT character; an LV or V character may
+ be followed by a V or T character; an LVT or T character may be follwed
+ only by a T character.
+
+- 4. Do not end before extending characters or spacing marks. Characters
+- with the "mark" property always have the "extend" grapheme breaking
++ 4. Do not end before extending characters or spacing marks. Characters
++ with the "mark" property always have the "extend" grapheme breaking
+ property.
+
+ 5. Do not end after prepend characters.
+@@ -5482,9 +5506,9 @@ BACKSLASH
+
+ PCRE's additional properties
+
+- As well as the standard Unicode properties described above, PCRE sup-
+- ports four more that make it possible to convert traditional escape
+- sequences such as \w and \s to use Unicode properties. PCRE uses these
++ As well as the standard Unicode properties described above, PCRE sup-
++ ports four more that make it possible to convert traditional escape
++ sequences such as \w and \s to use Unicode properties. PCRE uses these
+ non-standard, non-Perl properties internally when PCRE_UCP is set. How-
+ ever, they may also be used explicitly. These properties are:
+
+@@ -5493,54 +5517,54 @@ BACKSLASH
+ Xsp Any Perl space character
+ Xwd Any Perl "word" character
+
+- Xan matches characters that have either the L (letter) or the N (num-
+- ber) property. Xps matches the characters tab, linefeed, vertical tab,
+- form feed, or carriage return, and any other character that has the Z
+- (separator) property. Xsp is the same as Xps; it used to exclude ver-
+- tical tab, for Perl compatibility, but Perl changed, and so PCRE fol-
+- lowed at release 8.34. Xwd matches the same characters as Xan, plus
++ Xan matches characters that have either the L (letter) or the N (num-
++ ber) property. Xps matches the characters tab, linefeed, vertical tab,
++ form feed, or carriage return, and any other character that has the Z
++ (separator) property. Xsp is the same as Xps; it used to exclude ver-
++ tical tab, for Perl compatibility, but Perl changed, and so PCRE fol-
++ lowed at release 8.34. Xwd matches the same characters as Xan, plus
+ underscore.
+
+- There is another non-standard property, Xuc, which matches any charac-
+- ter that can be represented by a Universal Character Name in C++ and
+- other programming languages. These are the characters $, @, ` (grave
+- accent), and all characters with Unicode code points greater than or
+- equal to U+00A0, except for the surrogates U+D800 to U+DFFF. Note that
+- most base (ASCII) characters are excluded. (Universal Character Names
+- are of the form \uHHHH or \UHHHHHHHH where H is a hexadecimal digit.
++ There is another non-standard property, Xuc, which matches any charac-
++ ter that can be represented by a Universal Character Name in C++ and
++ other programming languages. These are the characters $, @, ` (grave
++ accent), and all characters with Unicode code points greater than or
++ equal to U+00A0, except for the surrogates U+D800 to U+DFFF. Note that
++ most base (ASCII) characters are excluded. (Universal Character Names
++ are of the form \uHHHH or \UHHHHHHHH where H is a hexadecimal digit.
+ Note that the Xuc property does not match these sequences but the char-
+ acters that they represent.)
+
+ Resetting the match start
+
+- The escape sequence \K causes any previously matched characters not to
++ The escape sequence \K causes any previously matched characters not to
+ be included in the final matched sequence. For example, the pattern:
+
+ foo\Kbar
+
+- matches "foobar", but reports that it has matched "bar". This feature
+- is similar to a lookbehind assertion (described below). However, in
+- this case, the part of the subject before the real match does not have
+- to be of fixed length, as lookbehind assertions do. The use of \K does
+- not interfere with the setting of captured substrings. For example,
++ matches "foobar", but reports that it has matched "bar". This feature
++ is similar to a lookbehind assertion (described below). However, in
++ this case, the part of the subject before the real match does not have
++ to be of fixed length, as lookbehind assertions do. The use of \K does
++ not interfere with the setting of captured substrings. For example,
+ when the pattern
+
+ (foo)\Kbar
+
+ matches "foobar", the first substring is still set to "foo".
+
+- Perl documents that the use of \K within assertions is "not well
+- defined". In PCRE, \K is acted upon when it occurs inside positive
+- assertions, but is ignored in negative assertions. Note that when a
+- pattern such as (?=ab\K) matches, the reported start of the match can
++ Perl documents that the use of \K within assertions is "not well
++ defined". In PCRE, \K is acted upon when it occurs inside positive
++ assertions, but is ignored in negative assertions. Note that when a
++ pattern such as (?=ab\K) matches, the reported start of the match can
+ be greater than the end of the match.
+
+ Simple assertions
+
+- The final use of backslash is for certain simple assertions. An asser-
+- tion specifies a condition that has to be met at a particular point in
+- a match, without consuming any characters from the subject string. The
+- use of subpatterns for more complicated assertions is described below.
++ The final use of backslash is for certain simple assertions. An asser-
++ tion specifies a condition that has to be met at a particular point in
++ a match, without consuming any characters from the subject string. The
++ use of subpatterns for more complicated assertions is described below.
+ The backslashed assertions are:
+
+ \b matches at a word boundary
+@@ -5551,161 +5575,161 @@ BACKSLASH
+ \z matches only at the end of the subject
+ \G matches at the first matching position in the subject
+
+- Inside a character class, \b has a different meaning; it matches the
+- backspace character. If any other of these assertions appears in a
+- character class, by default it matches the corresponding literal char-
++ Inside a character class, \b has a different meaning; it matches the
++ backspace character. If any other of these assertions appears in a
++ character class, by default it matches the corresponding literal char-
+ acter (for example, \B matches the letter B). However, if the
+- PCRE_EXTRA option is set, an "invalid escape sequence" error is gener-
++ PCRE_EXTRA option is set, an "invalid escape sequence" error is gener-
+ ated instead.
+
+- A word boundary is a position in the subject string where the current
+- character and the previous character do not both match \w or \W (i.e.
+- one matches \w and the other matches \W), or the start or end of the
+- string if the first or last character matches \w, respectively. In a
+- UTF mode, the meanings of \w and \W can be changed by setting the
+- PCRE_UCP option. When this is done, it also affects \b and \B. Neither
+- PCRE nor Perl has a separate "start of word" or "end of word" metase-
+- quence. However, whatever follows \b normally determines which it is.
++ A word boundary is a position in the subject string where the current
++ character and the previous character do not both match \w or \W (i.e.
++ one matches \w and the other matches \W), or the start or end of the
++ string if the first or last character matches \w, respectively. In a
++ UTF mode, the meanings of \w and \W can be changed by setting the
++ PCRE_UCP option. When this is done, it also affects \b and \B. Neither
++ PCRE nor Perl has a separate "start of word" or "end of word" metase-
++ quence. However, whatever follows \b normally determines which it is.
+ For example, the fragment \ba matches "a" at the start of a word.
+
+- The \A, \Z, and \z assertions differ from the traditional circumflex
++ The \A, \Z, and \z assertions differ from the traditional circumflex
+ and dollar (described in the next section) in that they only ever match
+- at the very start and end of the subject string, whatever options are
+- set. Thus, they are independent of multiline mode. These three asser-
++ at the very start and end of the subject string, whatever options are
++ set. Thus, they are independent of multiline mode. These three asser-
+ tions are not affected by the PCRE_NOTBOL or PCRE_NOTEOL options, which
+- affect only the behaviour of the circumflex and dollar metacharacters.
+- However, if the startoffset argument of pcre_exec() is non-zero, indi-
++ affect only the behaviour of the circumflex and dollar metacharacters.
++ However, if the startoffset argument of pcre_exec() is non-zero, indi-
+ cating that matching is to start at a point other than the beginning of
+- the subject, \A can never match. The difference between \Z and \z is
++ the subject, \A can never match. The difference between \Z and \z is
+ that \Z matches before a newline at the end of the string as well as at
+ the very end, whereas \z matches only at the end.
+
+- The \G assertion is true only when the current matching position is at
+- the start point of the match, as specified by the startoffset argument
+- of pcre_exec(). It differs from \A when the value of startoffset is
+- non-zero. By calling pcre_exec() multiple times with appropriate argu-
++ The \G assertion is true only when the current matching position is at
++ the start point of the match, as specified by the startoffset argument
++ of pcre_exec(). It differs from \A when the value of startoffset is
++ non-zero. By calling pcre_exec() multiple times with appropriate argu-
+ ments, you can mimic Perl's /g option, and it is in this kind of imple-
+ mentation where \G can be useful.
+
+- Note, however, that PCRE's interpretation of \G, as the start of the
++ Note, however, that PCRE's interpretation of \G, as the start of the
+ current match, is subtly different from Perl's, which defines it as the
+- end of the previous match. In Perl, these can be different when the
+- previously matched string was empty. Because PCRE does just one match
++ end of the previous match. In Perl, these can be different when the
++ previously matched string was empty. Because PCRE does just one match
+ at a time, it cannot reproduce this behaviour.
+
+- If all the alternatives of a pattern begin with \G, the expression is
++ If all the alternatives of a pattern begin with \G, the expression is
+ anchored to the starting match position, and the "anchored" flag is set
+ in the compiled regular expression.
+
+
+ CIRCUMFLEX AND DOLLAR
+
+- The circumflex and dollar metacharacters are zero-width assertions.
+- That is, they test for a particular condition being true without con-
++ The circumflex and dollar metacharacters are zero-width assertions.
++ That is, they test for a particular condition being true without con-
+ suming any characters from the subject string.
+
+ Outside a character class, in the default matching mode, the circumflex
+- character is an assertion that is true only if the current matching
+- point is at the start of the subject string. If the startoffset argu-
+- ment of pcre_exec() is non-zero, circumflex can never match if the
+- PCRE_MULTILINE option is unset. Inside a character class, circumflex
++ character is an assertion that is true only if the current matching
++ point is at the start of the subject string. If the startoffset argu-
++ ment of pcre_exec() is non-zero, circumflex can never match if the
++ PCRE_MULTILINE option is unset. Inside a character class, circumflex
+ has an entirely different meaning (see below).
+
+- Circumflex need not be the first character of the pattern if a number
+- of alternatives are involved, but it should be the first thing in each
+- alternative in which it appears if the pattern is ever to match that
+- branch. If all possible alternatives start with a circumflex, that is,
+- if the pattern is constrained to match only at the start of the sub-
+- ject, it is said to be an "anchored" pattern. (There are also other
++ Circumflex need not be the first character of the pattern if a number
++ of alternatives are involved, but it should be the first thing in each
++ alternative in which it appears if the pattern is ever to match that
++ branch. If all possible alternatives start with a circumflex, that is,
++ if the pattern is constrained to match only at the start of the sub-
++ ject, it is said to be an "anchored" pattern. (There are also other
+ constructs that can cause a pattern to be anchored.)
+
+- The dollar character is an assertion that is true only if the current
+- matching point is at the end of the subject string, or immediately
+- before a newline at the end of the string (by default). Note, however,
+- that it does not actually match the newline. Dollar need not be the
++ The dollar character is an assertion that is true only if the current
++ matching point is at the end of the subject string, or immediately
++ before a newline at the end of the string (by default). Note, however,
++ that it does not actually match the newline. Dollar need not be the
+ last character of the pattern if a number of alternatives are involved,
+- but it should be the last item in any branch in which it appears. Dol-
++ but it should be the last item in any branch in which it appears. Dol-
+ lar has no special meaning in a character class.
+
+- The meaning of dollar can be changed so that it matches only at the
+- very end of the string, by setting the PCRE_DOLLAR_ENDONLY option at
++ The meaning of dollar can be changed so that it matches only at the
++ very end of the string, by setting the PCRE_DOLLAR_ENDONLY option at
+ compile time. This does not affect the \Z assertion.
+
+ The meanings of the circumflex and dollar characters are changed if the
+- PCRE_MULTILINE option is set. When this is the case, a circumflex
+- matches immediately after internal newlines as well as at the start of
+- the subject string. It does not match after a newline that ends the
+- string. A dollar matches before any newlines in the string, as well as
+- at the very end, when PCRE_MULTILINE is set. When newline is specified
+- as the two-character sequence CRLF, isolated CR and LF characters do
++ PCRE_MULTILINE option is set. When this is the case, a circumflex
++ matches immediately after internal newlines as well as at the start of
++ the subject string. It does not match after a newline that ends the
++ string. A dollar matches before any newlines in the string, as well as
++ at the very end, when PCRE_MULTILINE is set. When newline is specified
++ as the two-character sequence CRLF, isolated CR and LF characters do
+ not indicate newlines.
+
+- For example, the pattern /^abc$/ matches the subject string "def\nabc"
+- (where \n represents a newline) in multiline mode, but not otherwise.
+- Consequently, patterns that are anchored in single line mode because
+- all branches start with ^ are not anchored in multiline mode, and a
+- match for circumflex is possible when the startoffset argument of
+- pcre_exec() is non-zero. The PCRE_DOLLAR_ENDONLY option is ignored if
++ For example, the pattern /^abc$/ matches the subject string "def\nabc"
++ (where \n represents a newline) in multiline mode, but not otherwise.
++ Consequently, patterns that are anchored in single line mode because
++ all branches start with ^ are not anchored in multiline mode, and a
++ match for circumflex is possible when the startoffset argument of
++ pcre_exec() is non-zero. The PCRE_DOLLAR_ENDONLY option is ignored if
+ PCRE_MULTILINE is set.
+
+- Note that the sequences \A, \Z, and \z can be used to match the start
+- and end of the subject in both modes, and if all branches of a pattern
+- start with \A it is always anchored, whether or not PCRE_MULTILINE is
++ Note that the sequences \A, \Z, and \z can be used to match the start
++ and end of the subject in both modes, and if all branches of a pattern
++ start with \A it is always anchored, whether or not PCRE_MULTILINE is
+ set.
+
+
+ FULL STOP (PERIOD, DOT) AND \N
+
+ Outside a character class, a dot in the pattern matches any one charac-
+- ter in the subject string except (by default) a character that signi-
++ ter in the subject string except (by default) a character that signi-
+ fies the end of a line.
+
+- When a line ending is defined as a single character, dot never matches
+- that character; when the two-character sequence CRLF is used, dot does
+- not match CR if it is immediately followed by LF, but otherwise it
+- matches all characters (including isolated CRs and LFs). When any Uni-
+- code line endings are being recognized, dot does not match CR or LF or
++ When a line ending is defined as a single character, dot never matches
++ that character; when the two-character sequence CRLF is used, dot does
++ not match CR if it is immediately followed by LF, but otherwise it
++ matches all characters (including isolated CRs and LFs). When any Uni-
++ code line endings are being recognized, dot does not match CR or LF or
+ any of the other line ending characters.
+
+- The behaviour of dot with regard to newlines can be changed. If the
+- PCRE_DOTALL option is set, a dot matches any one character, without
++ The behaviour of dot with regard to newlines can be changed. If the
++ PCRE_DOTALL option is set, a dot matches any one character, without
+ exception. If the two-character sequence CRLF is present in the subject
+ string, it takes two dots to match it.
+
+- The handling of dot is entirely independent of the handling of circum-
+- flex and dollar, the only relationship being that they both involve
++ The handling of dot is entirely independent of the handling of circum-
++ flex and dollar, the only relationship being that they both involve
+ newlines. Dot has no special meaning in a character class.
+
+- The escape sequence \N behaves like a dot, except that it is not
+- affected by the PCRE_DOTALL option. In other words, it matches any
+- character except one that signifies the end of a line. Perl also uses
++ The escape sequence \N behaves like a dot, except that it is not
++ affected by the PCRE_DOTALL option. In other words, it matches any
++ character except one that signifies the end of a line. Perl also uses
+ \N to match characters by name; PCRE does not support this.
+
+
+ MATCHING A SINGLE DATA UNIT
+
+- Outside a character class, the escape sequence \C matches any one data
+- unit, whether or not a UTF mode is set. In the 8-bit library, one data
+- unit is one byte; in the 16-bit library it is a 16-bit unit; in the
+- 32-bit library it is a 32-bit unit. Unlike a dot, \C always matches
+- line-ending characters. The feature is provided in Perl in order to
++ Outside a character class, the escape sequence \C matches any one data
++ unit, whether or not a UTF mode is set. In the 8-bit library, one data
++ unit is one byte; in the 16-bit library it is a 16-bit unit; in the
++ 32-bit library it is a 32-bit unit. Unlike a dot, \C always matches
++ line-ending characters. The feature is provided in Perl in order to
+ match individual bytes in UTF-8 mode, but it is unclear how it can use-
+- fully be used. Because \C breaks up characters into individual data
+- units, matching one unit with \C in a UTF mode means that the rest of
++ fully be used. Because \C breaks up characters into individual data
++ units, matching one unit with \C in a UTF mode means that the rest of
+ the string may start with a malformed UTF character. This has undefined
+ results, because PCRE assumes that it is dealing with valid UTF strings
+- (and by default it checks this at the start of processing unless the
+- PCRE_NO_UTF8_CHECK, PCRE_NO_UTF16_CHECK or PCRE_NO_UTF32_CHECK option
++ (and by default it checks this at the start of processing unless the
++ PCRE_NO_UTF8_CHECK, PCRE_NO_UTF16_CHECK or PCRE_NO_UTF32_CHECK option
+ is used).
+
+- PCRE does not allow \C to appear in lookbehind assertions (described
+- below) in a UTF mode, because this would make it impossible to calcu-
++ PCRE does not allow \C to appear in lookbehind assertions (described
++ below) in a UTF mode, because this would make it impossible to calcu-
+ late the length of the lookbehind.
+
+ In general, the \C escape sequence is best avoided. However, one way of
+- using it that avoids the problem of malformed UTF characters is to use
+- a lookahead to check the length of the next character, as in this pat-
+- tern, which could be used with a UTF-8 string (ignore white space and
++ using it that avoids the problem of malformed UTF characters is to use
++ a lookahead to check the length of the next character, as in this pat-
++ tern, which could be used with a UTF-8 string (ignore white space and
+ line breaks):
+
+ (?| (?=[\x00-\x7f])(\C) |
+@@ -5713,11 +5737,11 @@ MATCHING A SINGLE DATA UNIT
+ (?=[\x{800}-\x{ffff}])(\C)(\C)(\C) |
+ (?=[\x{10000}-\x{1fffff}])(\C)(\C)(\C)(\C))
+
+- A group that starts with (?| resets the capturing parentheses numbers
+- in each alternative (see "Duplicate Subpattern Numbers" below). The
+- assertions at the start of each branch check the next UTF-8 character
+- for values whose encoding uses 1, 2, 3, or 4 bytes, respectively. The
+- character's individual bytes are then captured by the appropriate num-
++ A group that starts with (?| resets the capturing parentheses numbers
++ in each alternative (see "Duplicate Subpattern Numbers" below). The
++ assertions at the start of each branch check the next UTF-8 character
++ for values whose encoding uses 1, 2, 3, or 4 bytes, respectively. The
++ character's individual bytes are then captured by the appropriate num-
+ ber of groups.
+
+
+@@ -5727,109 +5751,109 @@ SQUARE BRACKETS AND CHARACTER CLASSES
+ closing square bracket. A closing square bracket on its own is not spe-
+ cial by default. However, if the PCRE_JAVASCRIPT_COMPAT option is set,
+ a lone closing square bracket causes a compile-time error. If a closing
+- square bracket is required as a member of the class, it should be the
+- first data character in the class (after an initial circumflex, if
++ square bracket is required as a member of the class, it should be the
++ first data character in the class (after an initial circumflex, if
+ present) or escaped with a backslash.
+
+- A character class matches a single character in the subject. In a UTF
+- mode, the character may be more than one data unit long. A matched
++ A character class matches a single character in the subject. In a UTF
++ mode, the character may be more than one data unit long. A matched
+ character must be in the set of characters defined by the class, unless
+- the first character in the class definition is a circumflex, in which
++ the first character in the class definition is a circumflex, in which
+ case the subject character must not be in the set defined by the class.
+- If a circumflex is actually required as a member of the class, ensure
++ If a circumflex is actually required as a member of the class, ensure
+ it is not the first character, or escape it with a backslash.
+
+- For example, the character class [aeiou] matches any lower case vowel,
+- while [^aeiou] matches any character that is not a lower case vowel.
++ For example, the character class [aeiou] matches any lower case vowel,
++ while [^aeiou] matches any character that is not a lower case vowel.
+ Note that a circumflex is just a convenient notation for specifying the
+- characters that are in the class by enumerating those that are not. A
+- class that starts with a circumflex is not an assertion; it still con-
+- sumes a character from the subject string, and therefore it fails if
++ characters that are in the class by enumerating those that are not. A
++ class that starts with a circumflex is not an assertion; it still con-
++ sumes a character from the subject string, and therefore it fails if
+ the current pointer is at the end of the string.
+
+ In UTF-8 (UTF-16, UTF-32) mode, characters with values greater than 255
+- (0xffff) can be included in a class as a literal string of data units,
++ (0xffff) can be included in a class as a literal string of data units,
+ or by using the \x{ escaping mechanism.
+
+- When caseless matching is set, any letters in a class represent both
+- their upper case and lower case versions, so for example, a caseless
+- [aeiou] matches "A" as well as "a", and a caseless [^aeiou] does not
+- match "A", whereas a caseful version would. In a UTF mode, PCRE always
+- understands the concept of case for characters whose values are less
+- than 128, so caseless matching is always possible. For characters with
+- higher values, the concept of case is supported if PCRE is compiled
+- with Unicode property support, but not otherwise. If you want to use
+- caseless matching in a UTF mode for characters 128 and above, you must
+- ensure that PCRE is compiled with Unicode property support as well as
++ When caseless matching is set, any letters in a class represent both
++ their upper case and lower case versions, so for example, a caseless
++ [aeiou] matches "A" as well as "a", and a caseless [^aeiou] does not
++ match "A", whereas a caseful version would. In a UTF mode, PCRE always
++ understands the concept of case for characters whose values are less
++ than 128, so caseless matching is always possible. For characters with
++ higher values, the concept of case is supported if PCRE is compiled
++ with Unicode property support, but not otherwise. If you want to use
++ caseless matching in a UTF mode for characters 128 and above, you must
++ ensure that PCRE is compiled with Unicode property support as well as
+ with UTF support.
+
+- Characters that might indicate line breaks are never treated in any
+- special way when matching character classes, whatever line-ending
+- sequence is in use, and whatever setting of the PCRE_DOTALL and
++ Characters that might indicate line breaks are never treated in any
++ special way when matching character classes, whatever line-ending
++ sequence is in use, and whatever setting of the PCRE_DOTALL and
+ PCRE_MULTILINE options is used. A class such as [^a] always matches one
+ of these characters.
+
+- The minus (hyphen) character can be used to specify a range of charac-
+- ters in a character class. For example, [d-m] matches any letter
+- between d and m, inclusive. If a minus character is required in a
+- class, it must be escaped with a backslash or appear in a position
+- where it cannot be interpreted as indicating a range, typically as the
++ The minus (hyphen) character can be used to specify a range of charac-
++ ters in a character class. For example, [d-m] matches any letter
++ between d and m, inclusive. If a minus character is required in a
++ class, it must be escaped with a backslash or appear in a position
++ where it cannot be interpreted as indicating a range, typically as the
+ first or last character in the class, or immediately after a range. For
+- example, [b-d-z] matches letters in the range b to d, a hyphen charac-
++ example, [b-d-z] matches letters in the range b to d, a hyphen charac-
+ ter, or z.
+
+ It is not possible to have the literal character "]" as the end charac-
+- ter of a range. A pattern such as [W-]46] is interpreted as a class of
+- two characters ("W" and "-") followed by a literal string "46]", so it
+- would match "W46]" or "-46]". However, if the "]" is escaped with a
+- backslash it is interpreted as the end of range, so [W-\]46] is inter-
+- preted as a class containing a range followed by two other characters.
+- The octal or hexadecimal representation of "]" can also be used to end
++ ter of a range. A pattern such as [W-]46] is interpreted as a class of
++ two characters ("W" and "-") followed by a literal string "46]", so it
++ would match "W46]" or "-46]". However, if the "]" is escaped with a
++ backslash it is interpreted as the end of range, so [W-\]46] is inter-
++ preted as a class containing a range followed by two other characters.
++ The octal or hexadecimal representation of "]" can also be used to end
+ a range.
+
+- An error is generated if a POSIX character class (see below) or an
+- escape sequence other than one that defines a single character appears
+- at a point where a range ending character is expected. For example,
++ An error is generated if a POSIX character class (see below) or an
++ escape sequence other than one that defines a single character appears
++ at a point where a range ending character is expected. For example,
+ [z-\xff] is valid, but [A-\d] and [A-[:digit:]] are not.
+
+- Ranges operate in the collating sequence of character values. They can
+- also be used for characters specified numerically, for example
+- [\000-\037]. Ranges can include any characters that are valid for the
++ Ranges operate in the collating sequence of character values. They can
++ also be used for characters specified numerically, for example
++ [\000-\037]. Ranges can include any characters that are valid for the
+ current mode.
+
+ If a range that includes letters is used when caseless matching is set,
+ it matches the letters in either case. For example, [W-c] is equivalent
+- to [][\\^_`wxyzabc], matched caselessly, and in a non-UTF mode, if
+- character tables for a French locale are in use, [\xc8-\xcb] matches
+- accented E characters in both cases. In UTF modes, PCRE supports the
+- concept of case for characters with values greater than 128 only when
++ to [][\\^_`wxyzabc], matched caselessly, and in a non-UTF mode, if
++ character tables for a French locale are in use, [\xc8-\xcb] matches
++ accented E characters in both cases. In UTF modes, PCRE supports the
++ concept of case for characters with values greater than 127 only when
+ it is compiled with Unicode property support.
+
+- The character escape sequences \d, \D, \h, \H, \p, \P, \s, \S, \v, \V,
++ The character escape sequences \d, \D, \h, \H, \p, \P, \s, \S, \v, \V,
+ \w, and \W may appear in a character class, and add the characters that
+- they match to the class. For example, [\dABCDEF] matches any hexadeci-
+- mal digit. In UTF modes, the PCRE_UCP option affects the meanings of
+- \d, \s, \w and their upper case partners, just as it does when they
+- appear outside a character class, as described in the section entitled
++ they match to the class. For example, [\dABCDEF] matches any hexadeci-
++ mal digit. In UTF modes, the PCRE_UCP option affects the meanings of
++ \d, \s, \w and their upper case partners, just as it does when they
++ appear outside a character class, as described in the section entitled
+ "Generic character types" above. The escape sequence \b has a different
+- meaning inside a character class; it matches the backspace character.
+- The sequences \B, \N, \R, and \X are not special inside a character
+- class. Like any other unrecognized escape sequences, they are treated
+- as the literal characters "B", "N", "R", and "X" by default, but cause
++ meaning inside a character class; it matches the backspace character.
++ The sequences \B, \N, \R, and \X are not special inside a character
++ class. Like any other unrecognized escape sequences, they are treated
++ as the literal characters "B", "N", "R", and "X" by default, but cause
+ an error if the PCRE_EXTRA option is set.
+
+- A circumflex can conveniently be used with the upper case character
+- types to specify a more restricted set of characters than the matching
+- lower case type. For example, the class [^\W_] matches any letter or
++ A circumflex can conveniently be used with the upper case character
++ types to specify a more restricted set of characters than the matching
++ lower case type. For example, the class [^\W_] matches any letter or
+ digit, but not underscore, whereas [\w] includes underscore. A positive
+ character class should be read as "something OR something OR ..." and a
+ negative class as "NOT something AND NOT something AND NOT ...".
+
+- The only metacharacters that are recognized in character classes are
+- backslash, hyphen (only where it can be interpreted as specifying a
+- range), circumflex (only at the start), opening square bracket (only
+- when it can be interpreted as introducing a POSIX class name, or for a
+- special compatibility feature - see the next two sections), and the
++ The only metacharacters that are recognized in character classes are
++ backslash, hyphen (only where it can be interpreted as specifying a
++ range), circumflex (only at the start), opening square bracket (only
++ when it can be interpreted as introducing a POSIX class name, or for a
++ special compatibility feature - see the next two sections), and the
+ terminating closing square bracket. However, escaping other non-
+ alphanumeric characters does no harm.
+
+@@ -5837,7 +5861,7 @@ SQUARE BRACKETS AND CHARACTER CLASSES
+ POSIX CHARACTER CLASSES
+
+ Perl supports the POSIX notation for character classes. This uses names
+- enclosed by [: and :] within the enclosing square brackets. PCRE also
++ enclosed by [: and :] within the enclosing square brackets. PCRE also
+ supports this notation. For example,
+
+ [01[:alpha:]%]
+@@ -5860,28 +5884,28 @@ POSIX CHARACTER CLASSES
+ word "word" characters (same as \w)
+ xdigit hexadecimal digits
+
+- The default "space" characters are HT (9), LF (10), VT (11), FF (12),
+- CR (13), and space (32). If locale-specific matching is taking place,
+- the list of space characters may be different; there may be fewer or
++ The default "space" characters are HT (9), LF (10), VT (11), FF (12),
++ CR (13), and space (32). If locale-specific matching is taking place,
++ the list of space characters may be different; there may be fewer or
+ more of them. "Space" used to be different to \s, which did not include
+ VT, for Perl compatibility. However, Perl changed at release 5.18, and
+- PCRE followed at release 8.34. "Space" and \s now match the same set
++ PCRE followed at release 8.34. "Space" and \s now match the same set
+ of characters.
+
+- The name "word" is a Perl extension, and "blank" is a GNU extension
+- from Perl 5.8. Another Perl extension is negation, which is indicated
++ The name "word" is a Perl extension, and "blank" is a GNU extension
++ from Perl 5.8. Another Perl extension is negation, which is indicated
+ by a ^ character after the colon. For example,
+
+ [12[:^digit:]]
+
+- matches "1", "2", or any non-digit. PCRE (and Perl) also recognize the
++ matches "1", "2", or any non-digit. PCRE (and Perl) also recognize the
+ POSIX syntax [.ch.] and [=ch=] where "ch" is a "collating element", but
+ these are not supported, and an error is given if they are encountered.
+
+ By default, characters with values greater than 128 do not match any of
+- the POSIX character classes. However, if the PCRE_UCP option is passed
+- to pcre_compile(), some of the classes are changed so that Unicode
+- character properties are used. This is achieved by replacing certain
++ the POSIX character classes. However, if the PCRE_UCP option is passed
++ to pcre_compile(), some of the classes are changed so that Unicode
++ character properties are used. This is achieved by replacing certain
+ POSIX classes by other sequences, as follows:
+
+ [:alnum:] becomes \p{Xan}
+@@ -5893,10 +5917,10 @@ POSIX CHARACTER CLASSES
+ [:upper:] becomes \p{Lu}
+ [:word:] becomes \p{Xwd}
+
+- Negated versions, such as [:^alpha:] use \P instead of \p. Three other
++ Negated versions, such as [:^alpha:] use \P instead of \p. Three other
+ POSIX classes are handled specially in UCP mode:
+
+- [:graph:] This matches characters that have glyphs that mark the page
++ [:graph:] This matches characters that have glyphs that mark the page
+ when printed. In Unicode property terms, it matches all char-
+ acters with the L, M, N, P, S, or Cf properties, except for:
+
+@@ -5905,58 +5929,58 @@ POSIX CHARACTER CLASSES
+ U+2066 - U+2069 Various "isolate"s
+
+
+- [:print:] This matches the same characters as [:graph:] plus space
+- characters that are not controls, that is, characters with
++ [:print:] This matches the same characters as [:graph:] plus space
++ characters that are not controls, that is, characters with
+ the Zs property.
+
+ [:punct:] This matches all characters that have the Unicode P (punctua-
+- tion) property, plus those characters whose code points are
++ tion) property, plus those characters whose code points are
+ less than 128 that have the S (Symbol) property.
+
+- The other POSIX classes are unchanged, and match only characters with
++ The other POSIX classes are unchanged, and match only characters with
+ code points less than 128.
+
+
+ COMPATIBILITY FEATURE FOR WORD BOUNDARIES
+
+- In the POSIX.2 compliant library that was included in 4.4BSD Unix, the
+- ugly syntax [[:<:]] and [[:>:]] is used for matching "start of word"
++ In the POSIX.2 compliant library that was included in 4.4BSD Unix, the
++ ugly syntax [[:<:]] and [[:>:]] is used for matching "start of word"
+ and "end of word". PCRE treats these items as follows:
+
+ [[:<:]] is converted to \b(?=\w)
+ [[:>:]] is converted to \b(?<=\w)
+
+ Only these exact character sequences are recognized. A sequence such as
+- [a[:<:]b] provokes error for an unrecognized POSIX class name. This
+- support is not compatible with Perl. It is provided to help migrations
++ [a[:<:]b] provokes an error for an unrecognized POSIX class name. This
++ support is not compatible with Perl. It is provided to help migrations
+ from other environments, and is best not used in any new patterns. Note
+- that \b matches at the start and the end of a word (see "Simple asser-
+- tions" above), and in a Perl-style pattern the preceding or following
+- character normally shows which is wanted, without the need for the
+- assertions that are used above in order to give exactly the POSIX be-
++ that \b matches at the start and the end of a word (see "Simple asser-
++ tions" above), and in a Perl-style pattern the preceding or following
++ character normally shows which is wanted, without the need for the
++ assertions that are used above in order to give exactly the POSIX be-
+ haviour.
+
+
+ VERTICAL BAR
+
+- Vertical bar characters are used to separate alternative patterns. For
++ Vertical bar characters are used to separate alternative patterns. For
+ example, the pattern
+
+ gilbert|sullivan
+
+- matches either "gilbert" or "sullivan". Any number of alternatives may
+- appear, and an empty alternative is permitted (matching the empty
++ matches either "gilbert" or "sullivan". Any number of alternatives may
++ appear, and an empty alternative is permitted (matching the empty
+ string). The matching process tries each alternative in turn, from left
+- to right, and the first one that succeeds is used. If the alternatives
+- are within a subpattern (defined below), "succeeds" means matching the
++ to right, and the first one that succeeds is used. If the alternatives
++ are within a subpattern (defined below), "succeeds" means matching the
+ rest of the main pattern as well as the alternative in the subpattern.
+
+
+ INTERNAL OPTION SETTING
+
+- The settings of the PCRE_CASELESS, PCRE_MULTILINE, PCRE_DOTALL, and
+- PCRE_EXTENDED options (which are Perl-compatible) can be changed from
+- within the pattern by a sequence of Perl option letters enclosed
++ The settings of the PCRE_CASELESS, PCRE_MULTILINE, PCRE_DOTALL, and
++ PCRE_EXTENDED options (which are Perl-compatible) can be changed from
++ within the pattern by a sequence of Perl option letters enclosed
+ between "(?" and ")". The option letters are
+
+ i for PCRE_CASELESS
+@@ -5966,51 +5990,51 @@ INTERNAL OPTION SETTING
+
+ For example, (?im) sets caseless, multiline matching. It is also possi-
+ ble to unset these options by preceding the letter with a hyphen, and a
+- combined setting and unsetting such as (?im-sx), which sets PCRE_CASE-
+- LESS and PCRE_MULTILINE while unsetting PCRE_DOTALL and PCRE_EXTENDED,
+- is also permitted. If a letter appears both before and after the
++ combined setting and unsetting such as (?im-sx), which sets PCRE_CASE-
++ LESS and PCRE_MULTILINE while unsetting PCRE_DOTALL and PCRE_EXTENDED,
++ is also permitted. If a letter appears both before and after the
+ hyphen, the option is unset.
+
+- The PCRE-specific options PCRE_DUPNAMES, PCRE_UNGREEDY, and PCRE_EXTRA
+- can be changed in the same way as the Perl-compatible options by using
++ The PCRE-specific options PCRE_DUPNAMES, PCRE_UNGREEDY, and PCRE_EXTRA
++ can be changed in the same way as the Perl-compatible options by using
+ the characters J, U and X respectively.
+
+- When one of these option changes occurs at top level (that is, not
+- inside subpattern parentheses), the change applies to the remainder of
++ When one of these option changes occurs at top level (that is, not
++ inside subpattern parentheses), the change applies to the remainder of
+ the pattern that follows. If the change is placed right at the start of
+ a pattern, PCRE extracts it into the global options (and it will there-
+ fore show up in data extracted by the pcre_fullinfo() function).
+
+- An option change within a subpattern (see below for a description of
+- subpatterns) affects only that part of the subpattern that follows it,
++ An option change within a subpattern (see below for a description of
++ subpatterns) affects only that part of the subpattern that follows it,
+ so
+
+ (a(?i)b)c
+
+ matches abc and aBc and no other strings (assuming PCRE_CASELESS is not
+- used). By this means, options can be made to have different settings
+- in different parts of the pattern. Any changes made in one alternative
+- do carry on into subsequent branches within the same subpattern. For
++ used). By this means, options can be made to have different settings
++ in different parts of the pattern. Any changes made in one alternative
++ do carry on into subsequent branches within the same subpattern. For
+ example,
+
+ (a(?i)b|c)
+
+- matches "ab", "aB", "c", and "C", even though when matching "C" the
+- first branch is abandoned before the option setting. This is because
+- the effects of option settings happen at compile time. There would be
++ matches "ab", "aB", "c", and "C", even though when matching "C" the
++ first branch is abandoned before the option setting. This is because
++ the effects of option settings happen at compile time. There would be
+ some very weird behaviour otherwise.
+
+- Note: There are other PCRE-specific options that can be set by the
+- application when the compiling or matching functions are called. In
+- some cases the pattern can contain special leading sequences such as
+- (*CRLF) to override what the application has set or what has been
+- defaulted. Details are given in the section entitled "Newline
+- sequences" above. There are also the (*UTF8), (*UTF16),(*UTF32), and
+- (*UCP) leading sequences that can be used to set UTF and Unicode prop-
+- erty modes; they are equivalent to setting the PCRE_UTF8, PCRE_UTF16,
+- PCRE_UTF32 and the PCRE_UCP options, respectively. The (*UTF) sequence
+- is a generic version that can be used with any of the libraries. How-
+- ever, the application can set the PCRE_NEVER_UTF option, which locks
++ Note: There are other PCRE-specific options that can be set by the
++ application when the compiling or matching functions are called. In
++ some cases the pattern can contain special leading sequences such as
++ (*CRLF) to override what the application has set or what has been
++ defaulted. Details are given in the section entitled "Newline
++ sequences" above. There are also the (*UTF8), (*UTF16), (*UTF32), and
++ (*UCP) leading sequences that can be used to set UTF and Unicode prop-
++ erty modes; they are equivalent to setting the PCRE_UTF8, PCRE_UTF16,
++ PCRE_UTF32 and the PCRE_UCP options, respectively. The (*UTF) sequence
++ is a generic version that can be used with any of the libraries. How-
++ ever, the application can set the PCRE_NEVER_UTF option, which locks
+ out the use of the (*UTF) sequences.
+
+
+@@ -6023,18 +6047,18 @@ SUBPATTERNS
+
+ cat(aract|erpillar|)
+
+- matches "cataract", "caterpillar", or "cat". Without the parentheses,
++ matches "cataract", "caterpillar", or "cat". Without the parentheses,
+ it would match "cataract", "erpillar" or an empty string.
+
+- 2. It sets up the subpattern as a capturing subpattern. This means
+- that, when the whole pattern matches, that portion of the subject
++ 2. It sets up the subpattern as a capturing subpattern. This means
++ that, when the whole pattern matches, that portion of the subject
+ string that matched the subpattern is passed back to the caller via the
+- ovector argument of the matching function. (This applies only to the
+- traditional matching functions; the DFA matching functions do not sup-
++ ovector argument of the matching function. (This applies only to the
++ traditional matching functions; the DFA matching functions do not sup-
+ port capturing.)
+
+ Opening parentheses are counted from left to right (starting from 1) to
+- obtain numbers for the capturing subpatterns. For example, if the
++ obtain numbers for the capturing subpatterns. For example, if the
+ string "the red king" is matched against the pattern
+
+ the ((red|white) (king|queen))
+@@ -6042,12 +6066,12 @@ SUBPATTERNS
+ the captured substrings are "red king", "red", and "king", and are num-
+ bered 1, 2, and 3, respectively.
+
+- The fact that plain parentheses fulfil two functions is not always
+- helpful. There are often times when a grouping subpattern is required
+- without a capturing requirement. If an opening parenthesis is followed
+- by a question mark and a colon, the subpattern does not do any captur-
+- ing, and is not counted when computing the number of any subsequent
+- capturing subpatterns. For example, if the string "the white queen" is
++ The fact that plain parentheses fulfil two functions is not always
++ helpful. There are often times when a grouping subpattern is required
++ without a capturing requirement. If an opening parenthesis is followed
++ by a question mark and a colon, the subpattern does not do any captur-
++ ing, and is not counted when computing the number of any subsequent
++ capturing subpatterns. For example, if the string "the white queen" is
+ matched against the pattern
+
+ the ((?:red|white) (king|queen))
+@@ -6055,37 +6079,37 @@ SUBPATTERNS
+ the captured substrings are "white queen" and "queen", and are numbered
+ 1 and 2. The maximum number of capturing subpatterns is 65535.
+
+- As a convenient shorthand, if any option settings are required at the
+- start of a non-capturing subpattern, the option letters may appear
++ As a convenient shorthand, if any option settings are required at the
++ start of a non-capturing subpattern, the option letters may appear
+ between the "?" and the ":". Thus the two patterns
+
+ (?i:saturday|sunday)
+ (?:(?i)saturday|sunday)
+
+ match exactly the same set of strings. Because alternative branches are
+- tried from left to right, and options are not reset until the end of
+- the subpattern is reached, an option setting in one branch does affect
+- subsequent branches, so the above patterns match "SUNDAY" as well as
++ tried from left to right, and options are not reset until the end of
++ the subpattern is reached, an option setting in one branch does affect
++ subsequent branches, so the above patterns match "SUNDAY" as well as
+ "Saturday".
+
+
+ DUPLICATE SUBPATTERN NUMBERS
+
+ Perl 5.10 introduced a feature whereby each alternative in a subpattern
+- uses the same numbers for its capturing parentheses. Such a subpattern
+- starts with (?| and is itself a non-capturing subpattern. For example,
++ uses the same numbers for its capturing parentheses. Such a subpattern
++ starts with (?| and is itself a non-capturing subpattern. For example,
+ consider this pattern:
+
+ (?|(Sat)ur|(Sun))day
+
+- Because the two alternatives are inside a (?| group, both sets of cap-
+- turing parentheses are numbered one. Thus, when the pattern matches,
+- you can look at captured substring number one, whichever alternative
+- matched. This construct is useful when you want to capture part, but
++ Because the two alternatives are inside a (?| group, both sets of cap-
++ turing parentheses are numbered one. Thus, when the pattern matches,
++ you can look at captured substring number one, whichever alternative
++ matched. This construct is useful when you want to capture part, but
+ not all, of one of a number of alternatives. Inside a (?| group, paren-
+- theses are numbered as usual, but the number is reset at the start of
+- each branch. The numbers of any capturing parentheses that follow the
+- subpattern start after the highest number used in any branch. The fol-
++ theses are numbered as usual, but the number is reset at the start of
++ each branch. The numbers of any capturing parentheses that follow the
++ subpattern start after the highest number used in any branch. The fol-
+ lowing example is taken from the Perl documentation. The numbers under-
+ neath show in which buffer the captured content will be stored.
+
+@@ -6093,58 +6117,58 @@ DUPLICATE SUBPATTERN NUMBERS
+ / ( a ) (?| x ( y ) z | (p (q) r) | (t) u (v) ) ( z ) /x
+ # 1 2 2 3 2 3 4
+
+- A back reference to a numbered subpattern uses the most recent value
+- that is set for that number by any subpattern. The following pattern
++ A back reference to a numbered subpattern uses the most recent value
++ that is set for that number by any subpattern. The following pattern
+ matches "abcabc" or "defdef":
+
+ /(?|(abc)|(def))\1/
+
+- In contrast, a subroutine call to a numbered subpattern always refers
+- to the first one in the pattern with the given number. The following
++ In contrast, a subroutine call to a numbered subpattern always refers
++ to the first one in the pattern with the given number. The following
+ pattern matches "abcabc" or "defabc":
+
+ /(?|(abc)|(def))(?1)/
+
+- If a condition test for a subpattern's having matched refers to a non-
+- unique number, the test is true if any of the subpatterns of that num-
++ If a condition test for a subpattern's having matched refers to a non-
++ unique number, the test is true if any of the subpatterns of that num-
+ ber have matched.
+
+- An alternative approach to using this "branch reset" feature is to use
++ An alternative approach to using this "branch reset" feature is to use
+ duplicate named subpatterns, as described in the next section.
+
+
+ NAMED SUBPATTERNS
+
+- Identifying capturing parentheses by number is simple, but it can be
+- very hard to keep track of the numbers in complicated regular expres-
+- sions. Furthermore, if an expression is modified, the numbers may
+- change. To help with this difficulty, PCRE supports the naming of sub-
++ Identifying capturing parentheses by number is simple, but it can be
++ very hard to keep track of the numbers in complicated regular expres-
++ sions. Furthermore, if an expression is modified, the numbers may
++ change. To help with this difficulty, PCRE supports the naming of sub-
+ patterns. This feature was not added to Perl until release 5.10. Python
+- had the feature earlier, and PCRE introduced it at release 4.0, using
+- the Python syntax. PCRE now supports both the Perl and the Python syn-
+- tax. Perl allows identically numbered subpatterns to have different
++ had the feature earlier, and PCRE introduced it at release 4.0, using
++ the Python syntax. PCRE now supports both the Perl and the Python syn-
++ tax. Perl allows identically numbered subpatterns to have different
+ names, but PCRE does not.
+
+- In PCRE, a subpattern can be named in one of three ways: (?<name>...)
+- or (?'name'...) as in Perl, or (?P<name>...) as in Python. References
+- to capturing parentheses from other parts of the pattern, such as back
+- references, recursion, and conditions, can be made by name as well as
++ In PCRE, a subpattern can be named in one of three ways: (?<name>...)
++ or (?'name'...) as in Perl, or (?P<name>...) as in Python. References
++ to capturing parentheses from other parts of the pattern, such as back
++ references, recursion, and conditions, can be made by name as well as
+ by number.
+
+- Names consist of up to 32 alphanumeric characters and underscores, but
+- must start with a non-digit. Named capturing parentheses are still
+- allocated numbers as well as names, exactly as if the names were not
+- present. The PCRE API provides function calls for extracting the name-
+- to-number translation table from a compiled pattern. There is also a
++ Names consist of up to 32 alphanumeric characters and underscores, but
++ must start with a non-digit. Named capturing parentheses are still
++ allocated numbers as well as names, exactly as if the names were not
++ present. The PCRE API provides function calls for extracting the name-
++ to-number translation table from a compiled pattern. There is also a
+ convenience function for extracting a captured substring by name.
+
+- By default, a name must be unique within a pattern, but it is possible
++ By default, a name must be unique within a pattern, but it is possible
+ to relax this constraint by setting the PCRE_DUPNAMES option at compile
+- time. (Duplicate names are also always permitted for subpatterns with
+- the same number, set up as described in the previous section.) Dupli-
+- cate names can be useful for patterns where only one instance of the
+- named parentheses can match. Suppose you want to match the name of a
+- weekday, either as a 3-letter abbreviation or as the full name, and in
++ time. (Duplicate names are also always permitted for subpatterns with
++ the same number, set up as described in the previous section.) Dupli-
++ cate names can be useful for patterns where only one instance of the
++ named parentheses can match. Suppose you want to match the name of a
++ weekday, either as a 3-letter abbreviation or as the full name, and in
+ both cases you want to extract the abbreviation. This pattern (ignoring
+ the line breaks) does the job:
+
+@@ -6154,18 +6178,18 @@ NAMED SUBPATTERNS
+ (?<DN>Thu)(?:rsday)?|
+ (?<DN>Sat)(?:urday)?
+
+- There are five capturing substrings, but only one is ever set after a
++ There are five capturing substrings, but only one is ever set after a
+ match. (An alternative way of solving this problem is to use a "branch
+ reset" subpattern, as described in the previous section.)
+
+- The convenience function for extracting the data by name returns the
+- substring for the first (and in this example, the only) subpattern of
+- that name that matched. This saves searching to find which numbered
++ The convenience function for extracting the data by name returns the
++ substring for the first (and in this example, the only) subpattern of
++ that name that matched. This saves searching to find which numbered
+ subpattern it was.
+
+- If you make a back reference to a non-unique named subpattern from
+- elsewhere in the pattern, the subpatterns to which the name refers are
+- checked in the order in which they appear in the overall pattern. The
++ If you make a back reference to a non-unique named subpattern from
++ elsewhere in the pattern, the subpatterns to which the name refers are
++ checked in the order in which they appear in the overall pattern. The
+ first one that is set is used for the reference. For example, this pat-
+ tern matches both "foofoo" and "barbar" but not "foobar" or "barfoo":
+
+@@ -6173,29 +6197,29 @@ NAMED SUBPATTERNS
+
+
+ If you make a subroutine call to a non-unique named subpattern, the one
+- that corresponds to the first occurrence of the name is used. In the
++ that corresponds to the first occurrence of the name is used. In the
+ absence of duplicate numbers (see the previous section) this is the one
+ with the lowest number.
+
+ If you use a named reference in a condition test (see the section about
+ conditions below), either to check whether a subpattern has matched, or
+- to check for recursion, all subpatterns with the same name are tested.
+- If the condition is true for any one of them, the overall condition is
+- true. This is the same behaviour as testing by number. For further
+- details of the interfaces for handling named subpatterns, see the
++ to check for recursion, all subpatterns with the same name are tested.
++ If the condition is true for any one of them, the overall condition is
++ true. This is the same behaviour as testing by number. For further
++ details of the interfaces for handling named subpatterns, see the
+ pcreapi documentation.
+
+ Warning: You cannot use different names to distinguish between two sub-
+- patterns with the same number because PCRE uses only the numbers when
++ patterns with the same number because PCRE uses only the numbers when
+ matching. For this reason, an error is given at compile time if differ-
+- ent names are given to subpatterns with the same number. However, you
++ ent names are given to subpatterns with the same number. However, you
+ can always give the same name to subpatterns with the same number, even
+ when PCRE_DUPNAMES is not set.
+
+
+ REPETITION
+
+- Repetition is specified by quantifiers, which can follow any of the
++ Repetition is specified by quantifiers, which can follow any of the
+ following items:
+
+ a literal data character
+@@ -6209,17 +6233,17 @@ REPETITION
+ a parenthesized subpattern (including assertions)
+ a subroutine call to a subpattern (recursive or otherwise)
+
+- The general repetition quantifier specifies a minimum and maximum num-
+- ber of permitted matches, by giving the two numbers in curly brackets
+- (braces), separated by a comma. The numbers must be less than 65536,
++ The general repetition quantifier specifies a minimum and maximum num-
++ ber of permitted matches, by giving the two numbers in curly brackets
++ (braces), separated by a comma. The numbers must be less than 65536,
+ and the first must be less than or equal to the second. For example:
+
+ z{2,4}
+
+- matches "zz", "zzz", or "zzzz". A closing brace on its own is not a
+- special character. If the second number is omitted, but the comma is
+- present, there is no upper limit; if the second number and the comma
+- are both omitted, the quantifier specifies an exact number of required
++ matches "zz", "zzz", or "zzzz". A closing brace on its own is not a
++ special character. If the second number is omitted, but the comma is
++ present, there is no upper limit; if the second number and the comma
++ are both omitted, the quantifier specifies an exact number of required
+ matches. Thus
+
+ [aeiou]{3,}
+@@ -6228,50 +6252,50 @@ REPETITION
+
+ \d{8}
+
+- matches exactly 8 digits. An opening curly bracket that appears in a
+- position where a quantifier is not allowed, or one that does not match
+- the syntax of a quantifier, is taken as a literal character. For exam-
++ matches exactly 8 digits. An opening curly bracket that appears in a
++ position where a quantifier is not allowed, or one that does not match
++ the syntax of a quantifier, is taken as a literal character. For exam-
+ ple, {,6} is not a quantifier, but a literal string of four characters.
+
+ In UTF modes, quantifiers apply to characters rather than to individual
+- data units. Thus, for example, \x{100}{2} matches two characters, each
++ data units. Thus, for example, \x{100}{2} matches two characters, each
+ of which is represented by a two-byte sequence in a UTF-8 string. Simi-
+- larly, \X{3} matches three Unicode extended grapheme clusters, each of
+- which may be several data units long (and they may be of different
++ larly, \X{3} matches three Unicode extended grapheme clusters, each of
++ which may be several data units long (and they may be of different
+ lengths).
+
+ The quantifier {0} is permitted, causing the expression to behave as if
+ the previous item and the quantifier were not present. This may be use-
+- ful for subpatterns that are referenced as subroutines from elsewhere
++ ful for subpatterns that are referenced as subroutines from elsewhere
+ in the pattern (but see also the section entitled "Defining subpatterns
+- for use by reference only" below). Items other than subpatterns that
++ for use by reference only" below). Items other than subpatterns that
+ have a {0} quantifier are omitted from the compiled pattern.
+
+- For convenience, the three most common quantifiers have single-charac-
++ For convenience, the three most common quantifiers have single-charac-
+ ter abbreviations:
+
+ * is equivalent to {0,}
+ + is equivalent to {1,}
+ ? is equivalent to {0,1}
+
+- It is possible to construct infinite loops by following a subpattern
++ It is possible to construct infinite loops by following a subpattern
+ that can match no characters with a quantifier that has no upper limit,
+ for example:
+
+ (a?)*
+
+ Earlier versions of Perl and PCRE used to give an error at compile time
+- for such patterns. However, because there are cases where this can be
+- useful, such patterns are now accepted, but if any repetition of the
+- subpattern does in fact match no characters, the loop is forcibly bro-
++ for such patterns. However, because there are cases where this can be
++ useful, such patterns are now accepted, but if any repetition of the
++ subpattern does in fact match no characters, the loop is forcibly bro-
+ ken.
+
+- By default, the quantifiers are "greedy", that is, they match as much
+- as possible (up to the maximum number of permitted times), without
+- causing the rest of the pattern to fail. The classic example of where
++ By default, the quantifiers are "greedy", that is, they match as much
++ as possible (up to the maximum number of permitted times), without
++ causing the rest of the pattern to fail. The classic example of where
+ this gives problems is in trying to match comments in C programs. These
+- appear between /* and */ and within the comment, individual * and /
+- characters may appear. An attempt to match C comments by applying the
++ appear between /* and */ and within the comment, individual * and /
++ characters may appear. An attempt to match C comments by applying the
+ pattern
+
+ /\*.*\*/
+@@ -6280,19 +6304,19 @@ REPETITION
+
+ /* first comment */ not comment /* second comment */
+
+- fails, because it matches the entire string owing to the greediness of
++ fails, because it matches the entire string owing to the greediness of
+ the .* item.
+
+- However, if a quantifier is followed by a question mark, it ceases to
++ However, if a quantifier is followed by a question mark, it ceases to
+ be greedy, and instead matches the minimum number of times possible, so
+ the pattern
+
+ /\*.*?\*/
+
+- does the right thing with the C comments. The meaning of the various
+- quantifiers is not otherwise changed, just the preferred number of
+- matches. Do not confuse this use of question mark with its use as a
+- quantifier in its own right. Because it has two uses, it can sometimes
++ does the right thing with the C comments. The meaning of the various
++ quantifiers is not otherwise changed, just the preferred number of
++ matches. Do not confuse this use of question mark with its use as a
++ quantifier in its own right. Because it has two uses, it can sometimes
+ appear doubled, as in
+
+ \d??\d
+@@ -6300,45 +6324,45 @@ REPETITION
+ which matches one digit by preference, but can match two if that is the
+ only way the rest of the pattern matches.
+
+- If the PCRE_UNGREEDY option is set (an option that is not available in
+- Perl), the quantifiers are not greedy by default, but individual ones
+- can be made greedy by following them with a question mark. In other
++ If the PCRE_UNGREEDY option is set (an option that is not available in
++ Perl), the quantifiers are not greedy by default, but individual ones
++ can be made greedy by following them with a question mark. In other
+ words, it inverts the default behaviour.
+
+- When a parenthesized subpattern is quantified with a minimum repeat
+- count that is greater than 1 or with a limited maximum, more memory is
+- required for the compiled pattern, in proportion to the size of the
++ When a parenthesized subpattern is quantified with a minimum repeat
++ count that is greater than 1 or with a limited maximum, more memory is
++ required for the compiled pattern, in proportion to the size of the
+ minimum or maximum.
+
+ If a pattern starts with .* or .{0,} and the PCRE_DOTALL option (equiv-
+- alent to Perl's /s) is set, thus allowing the dot to match newlines,
+- the pattern is implicitly anchored, because whatever follows will be
+- tried against every character position in the subject string, so there
+- is no point in retrying the overall match at any position after the
+- first. PCRE normally treats such a pattern as though it were preceded
++ alent to Perl's /s) is set, thus allowing the dot to match newlines,
++ the pattern is implicitly anchored, because whatever follows will be
++ tried against every character position in the subject string, so there
++ is no point in retrying the overall match at any position after the
++ first. PCRE normally treats such a pattern as though it were preceded
+ by \A.
+
+- In cases where it is known that the subject string contains no new-
+- lines, it is worth setting PCRE_DOTALL in order to obtain this opti-
++ In cases where it is known that the subject string contains no new-
++ lines, it is worth setting PCRE_DOTALL in order to obtain this opti-
+ mization, or alternatively using ^ to indicate anchoring explicitly.
+
+- However, there are some cases where the optimization cannot be used.
++ However, there are some cases where the optimization cannot be used.
+ When .* is inside capturing parentheses that are the subject of a back
+ reference elsewhere in the pattern, a match at the start may fail where
+ a later one succeeds. Consider, for example:
+
+ (.*)abc\1
+
+- If the subject is "xyz123abc123" the match point is the fourth charac-
++ If the subject is "xyz123abc123" the match point is the fourth charac-
+ ter. For this reason, such a pattern is not implicitly anchored.
+
+- Another case where implicit anchoring is not applied is when the lead-
+- ing .* is inside an atomic group. Once again, a match at the start may
++ Another case where implicit anchoring is not applied is when the lead-
++ ing .* is inside an atomic group. Once again, a match at the start may
+ fail where a later one succeeds. Consider this pattern:
+
+ (?>.*?a)b
+
+- It matches "ab" in the subject "aab". The use of the backtracking con-
++ It matches "ab" in the subject "aab". The use of the backtracking con-
+ trol verbs (*PRUNE) and (*SKIP) also disable this optimization.
+
+ When a capturing subpattern is repeated, the value captured is the sub-
+@@ -6347,8 +6371,8 @@ REPETITION
+ (tweedle[dume]{3}\s*)+
+
+ has matched "tweedledum tweedledee" the value of the captured substring
+- is "tweedledee". However, if there are nested capturing subpatterns,
+- the corresponding captured values may have been set in previous itera-
++ is "tweedledee". However, if there are nested capturing subpatterns,
++ the corresponding captured values may have been set in previous itera-
+ tions. For example, after
+
+ /(a|(b))+/
+@@ -6358,53 +6382,53 @@ REPETITION
+
+ ATOMIC GROUPING AND POSSESSIVE QUANTIFIERS
+
+- With both maximizing ("greedy") and minimizing ("ungreedy" or "lazy")
+- repetition, failure of what follows normally causes the repeated item
+- to be re-evaluated to see if a different number of repeats allows the
+- rest of the pattern to match. Sometimes it is useful to prevent this,
+- either to change the nature of the match, or to cause it fail earlier
+- than it otherwise might, when the author of the pattern knows there is
++ With both maximizing ("greedy") and minimizing ("ungreedy" or "lazy")
++ repetition, failure of what follows normally causes the repeated item
++ to be re-evaluated to see if a different number of repeats allows the
++ rest of the pattern to match. Sometimes it is useful to prevent this,
++ either to change the nature of the match, or to cause it to fail earlier
++ than it otherwise might, when the author of the pattern knows there is
+ no point in carrying on.
+
+- Consider, for example, the pattern \d+foo when applied to the subject
++ Consider, for example, the pattern \d+foo when applied to the subject
+ line
+
+ 123456bar
+
+ After matching all 6 digits and then failing to match "foo", the normal
+- action of the matcher is to try again with only 5 digits matching the
+- \d+ item, and then with 4, and so on, before ultimately failing.
+- "Atomic grouping" (a term taken from Jeffrey Friedl's book) provides
+- the means for specifying that once a subpattern has matched, it is not
++ action of the matcher is to try again with only 5 digits matching the
++ \d+ item, and then with 4, and so on, before ultimately failing.
++ "Atomic grouping" (a term taken from Jeffrey Friedl's book) provides
++ the means for specifying that once a subpattern has matched, it is not
+ to be re-evaluated in this way.
+
+- If we use atomic grouping for the previous example, the matcher gives
+- up immediately on failing to match "foo" the first time. The notation
++ If we use atomic grouping for the previous example, the matcher gives
++ up immediately on failing to match "foo" the first time. The notation
+ is a kind of special parenthesis, starting with (?> as in this example:
+
+ (?>\d+)foo
+
+- This kind of parenthesis "locks up" the part of the pattern it con-
+- tains once it has matched, and a failure further into the pattern is
+- prevented from backtracking into it. Backtracking past it to previous
++ This kind of parenthesis "locks up" the part of the pattern it con-
++ tains once it has matched, and a failure further into the pattern is
++ prevented from backtracking into it. Backtracking past it to previous
+ items, however, works as normal.
+
+- An alternative description is that a subpattern of this type matches
+- the string of characters that an identical standalone pattern would
++ An alternative description is that a subpattern of this type matches
++ the string of characters that an identical standalone pattern would
+ match, if anchored at the current point in the subject string.
+
+ Atomic grouping subpatterns are not capturing subpatterns. Simple cases
+ such as the above example can be thought of as a maximizing repeat that
+- must swallow everything it can. So, while both \d+ and \d+? are pre-
+- pared to adjust the number of digits they match in order to make the
++ must swallow everything it can. So, while both \d+ and \d+? are pre-
++ pared to adjust the number of digits they match in order to make the
+ rest of the pattern match, (?>\d+) can only match an entire sequence of
+ digits.
+
+- Atomic groups in general can of course contain arbitrarily complicated
+- subpatterns, and can be nested. However, when the subpattern for an
++ Atomic groups in general can of course contain arbitrarily complicated
++ subpatterns, and can be nested. However, when the subpattern for an
+ atomic group is just a single repeated item, as in the example above, a
+- simpler notation, called a "possessive quantifier" can be used. This
+- consists of an additional + character following a quantifier. Using
++ simpler notation, called a "possessive quantifier" can be used. This
++ consists of an additional + character following a quantifier. Using
+ this notation, the previous example can be rewritten as
+
+ \d++foo
+@@ -6414,45 +6438,45 @@ ATOMIC GROUPING AND POSSESSIVE QUANTIFIERS
+
+ (abc|xyz){2,3}+
+
+- Possessive quantifiers are always greedy; the setting of the
++ Possessive quantifiers are always greedy; the setting of the
+ PCRE_UNGREEDY option is ignored. They are a convenient notation for the
+- simpler forms of atomic group. However, there is no difference in the
+- meaning of a possessive quantifier and the equivalent atomic group,
+- though there may be a performance difference; possessive quantifiers
++ simpler forms of atomic group. However, there is no difference in the
++ meaning of a possessive quantifier and the equivalent atomic group,
++ though there may be a performance difference; possessive quantifiers
+ should be slightly faster.
+
+- The possessive quantifier syntax is an extension to the Perl 5.8 syn-
+- tax. Jeffrey Friedl originated the idea (and the name) in the first
++ The possessive quantifier syntax is an extension to the Perl 5.8 syn-
++ tax. Jeffrey Friedl originated the idea (and the name) in the first
+ edition of his book. Mike McCloskey liked it, so implemented it when he
+- built Sun's Java package, and PCRE copied it from there. It ultimately
++ built Sun's Java package, and PCRE copied it from there. It ultimately
+ found its way into Perl at release 5.10.
+
+ PCRE has an optimization that automatically "possessifies" certain sim-
+- ple pattern constructs. For example, the sequence A+B is treated as
+- A++B because there is no point in backtracking into a sequence of A's
++ ple pattern constructs. For example, the sequence A+B is treated as
++ A++B because there is no point in backtracking into a sequence of A's
+ when B must follow.
+
+- When a pattern contains an unlimited repeat inside a subpattern that
+- can itself be repeated an unlimited number of times, the use of an
+- atomic group is the only way to avoid some failing matches taking a
++ When a pattern contains an unlimited repeat inside a subpattern that
++ can itself be repeated an unlimited number of times, the use of an
++ atomic group is the only way to avoid some failing matches taking a
+ very long time indeed. The pattern
+
+ (\D+|<\d+>)*[!?]
+
+- matches an unlimited number of substrings that either consist of non-
+- digits, or digits enclosed in <>, followed by either ! or ?. When it
++ matches an unlimited number of substrings that either consist of non-
++ digits, or digits enclosed in <>, followed by either ! or ?. When it
+ matches, it runs quickly. However, if it is applied to
+
+ aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
+
+- it takes a long time before reporting failure. This is because the
+- string can be divided between the internal \D+ repeat and the external
+- * repeat in a large number of ways, and all have to be tried. (The
+- example uses [!?] rather than a single character at the end, because
+- both PCRE and Perl have an optimization that allows for fast failure
+- when a single character is used. They remember the last single charac-
+- ter that is required for a match, and fail early if it is not present
+- in the string.) If the pattern is changed so that it uses an atomic
++ it takes a long time before reporting failure. This is because the
++ string can be divided between the internal \D+ repeat and the external
++ * repeat in a large number of ways, and all have to be tried. (The
++ example uses [!?] rather than a single character at the end, because
++ both PCRE and Perl have an optimization that allows for fast failure
++ when a single character is used. They remember the last single charac-
++ ter that is required for a match, and fail early if it is not present
++ in the string.) If the pattern is changed so that it uses an atomic
+ group, like this:
+
+ ((?>\D+)|<\d+>)*[!?]
+@@ -6464,28 +6488,28 @@ BACK REFERENCES
+
+ Outside a character class, a backslash followed by a digit greater than
+ 0 (and possibly further digits) is a back reference to a capturing sub-
+- pattern earlier (that is, to its left) in the pattern, provided there
++ pattern earlier (that is, to its left) in the pattern, provided there
+ have been that many previous capturing left parentheses.
+
+ However, if the decimal number following the backslash is less than 10,
+- it is always taken as a back reference, and causes an error only if
+- there are not that many capturing left parentheses in the entire pat-
+- tern. In other words, the parentheses that are referenced need not be
+- to the left of the reference for numbers less than 10. A "forward back
+- reference" of this type can make sense when a repetition is involved
+- and the subpattern to the right has participated in an earlier itera-
++ it is always taken as a back reference, and causes an error only if
++ there are not that many capturing left parentheses in the entire pat-
++ tern. In other words, the parentheses that are referenced need not be
++ to the left of the reference for numbers less than 10. A "forward back
++ reference" of this type can make sense when a repetition is involved
++ and the subpattern to the right has participated in an earlier itera-
+ tion.
+
+- It is not possible to have a numerical "forward back reference" to a
+- subpattern whose number is 10 or more using this syntax because a
+- sequence such as \50 is interpreted as a character defined in octal.
++ It is not possible to have a numerical "forward back reference" to a
++ subpattern whose number is 10 or more using this syntax because a
++ sequence such as \50 is interpreted as a character defined in octal.
+ See the subsection entitled "Non-printing characters" above for further
+- details of the handling of digits following a backslash. There is no
+- such problem when named parentheses are used. A back reference to any
++ details of the handling of digits following a backslash. There is no
++ such problem when named parentheses are used. A back reference to any
+ subpattern is possible using named parentheses (see below).
+
+- Another way of avoiding the ambiguity inherent in the use of digits
+- following a backslash is to use the \g escape sequence. This escape
++ Another way of avoiding the ambiguity inherent in the use of digits
++ following a backslash is to use the \g escape sequence. This escape
+ must be followed by an unsigned number or a negative number, optionally
+ enclosed in braces. These examples are all identical:
+
+@@ -6493,7 +6517,7 @@ BACK REFERENCES
+ (ring), \g1
+ (ring), \g{1}
+
+- An unsigned number specifies an absolute reference without the ambigu-
++ An unsigned number specifies an absolute reference without the ambigu-
+ ity that is present in the older syntax. It is also useful when literal
+ digits follow the reference. A negative number is a relative reference.
+ Consider this example:
+@@ -6502,33 +6526,33 @@ BACK REFERENCES
+
+ The sequence \g{-1} is a reference to the most recently started captur-
+ ing subpattern before \g, that is, is it equivalent to \2 in this exam-
+- ple. Similarly, \g{-2} would be equivalent to \1. The use of relative
+- references can be helpful in long patterns, and also in patterns that
+- are created by joining together fragments that contain references
++ ple. Similarly, \g{-2} would be equivalent to \1. The use of relative
++ references can be helpful in long patterns, and also in patterns that
++ are created by joining together fragments that contain references
+ within themselves.
+
+- A back reference matches whatever actually matched the capturing sub-
+- pattern in the current subject string, rather than anything matching
++ A back reference matches whatever actually matched the capturing sub-
++ pattern in the current subject string, rather than anything matching
+ the subpattern itself (see "Subpatterns as subroutines" below for a way
+ of doing that). So the pattern
+
+ (sens|respons)e and \1ibility
+
+- matches "sense and sensibility" and "response and responsibility", but
+- not "sense and responsibility". If caseful matching is in force at the
+- time of the back reference, the case of letters is relevant. For exam-
++ matches "sense and sensibility" and "response and responsibility", but
++ not "sense and responsibility". If caseful matching is in force at the
++ time of the back reference, the case of letters is relevant. For exam-
+ ple,
+
+ ((?i)rah)\s+\1
+
+- matches "rah rah" and "RAH RAH", but not "RAH rah", even though the
++ matches "rah rah" and "RAH RAH", but not "RAH rah", even though the
+ original capturing subpattern is matched caselessly.
+
+- There are several different ways of writing back references to named
+- subpatterns. The .NET syntax \k{name} and the Perl syntax \k<name> or
+- \k'name' are supported, as is the Python syntax (?P=name). Perl 5.10's
++ There are several different ways of writing back references to named
++ subpatterns. The .NET syntax \k{name} and the Perl syntax \k<name> or
++ \k'name' are supported, as is the Python syntax (?P=name). Perl 5.10's
+ unified back reference syntax, in which \g can be used for both numeric
+- and named references, is also supported. We could rewrite the above
++ and named references, is also supported. We could rewrite the above
+ example in any of the following ways:
+
+ (?<p1>(?i)rah)\s+\k<p1>
+@@ -6536,84 +6560,84 @@ BACK REFERENCES
+ (?P<p1>(?i)rah)\s+(?P=p1)
+ (?<p1>(?i)rah)\s+\g{p1}
+
+- A subpattern that is referenced by name may appear in the pattern
++ A subpattern that is referenced by name may appear in the pattern
+ before or after the reference.
+
+- There may be more than one back reference to the same subpattern. If a
+- subpattern has not actually been used in a particular match, any back
++ There may be more than one back reference to the same subpattern. If a
++ subpattern has not actually been used in a particular match, any back
+ references to it always fail by default. For example, the pattern
+
+ (a|(bc))\2
+
+- always fails if it starts to match "a" rather than "bc". However, if
++ always fails if it starts to match "a" rather than "bc". However, if
+ the PCRE_JAVASCRIPT_COMPAT option is set at compile time, a back refer-
+ ence to an unset value matches an empty string.
+
+- Because there may be many capturing parentheses in a pattern, all dig-
+- its following a backslash are taken as part of a potential back refer-
+- ence number. If the pattern continues with a digit character, some
+- delimiter must be used to terminate the back reference. If the
+- PCRE_EXTENDED option is set, this can be white space. Otherwise, the
++ Because there may be many capturing parentheses in a pattern, all dig-
++ its following a backslash are taken as part of a potential back refer-
++ ence number. If the pattern continues with a digit character, some
++ delimiter must be used to terminate the back reference. If the
++ PCRE_EXTENDED option is set, this can be white space. Otherwise, the
+ \g{ syntax or an empty comment (see "Comments" below) can be used.
+
+ Recursive back references
+
+- A back reference that occurs inside the parentheses to which it refers
+- fails when the subpattern is first used, so, for example, (a\1) never
+- matches. However, such references can be useful inside repeated sub-
++ A back reference that occurs inside the parentheses to which it refers
++ fails when the subpattern is first used, so, for example, (a\1) never
++ matches. However, such references can be useful inside repeated sub-
+ patterns. For example, the pattern
+
+ (a|b\1)+
+
+ matches any number of "a"s and also "aba", "ababbaa" etc. At each iter-
+- ation of the subpattern, the back reference matches the character
+- string corresponding to the previous iteration. In order for this to
+- work, the pattern must be such that the first iteration does not need
+- to match the back reference. This can be done using alternation, as in
++ ation of the subpattern, the back reference matches the character
++ string corresponding to the previous iteration. In order for this to
++ work, the pattern must be such that the first iteration does not need
++ to match the back reference. This can be done using alternation, as in
+ the example above, or by a quantifier with a minimum of zero.
+
+- Back references of this type cause the group that they reference to be
+- treated as an atomic group. Once the whole group has been matched, a
+- subsequent matching failure cannot cause backtracking into the middle
++ Back references of this type cause the group that they reference to be
++ treated as an atomic group. Once the whole group has been matched, a
++ subsequent matching failure cannot cause backtracking into the middle
+ of the group.
+
+
+ ASSERTIONS
+
+- An assertion is a test on the characters following or preceding the
+- current matching point that does not actually consume any characters.
+- The simple assertions coded as \b, \B, \A, \G, \Z, \z, ^ and $ are
++ An assertion is a test on the characters following or preceding the
++ current matching point that does not actually consume any characters.
++ The simple assertions coded as \b, \B, \A, \G, \Z, \z, ^ and $ are
+ described above.
+
+- More complicated assertions are coded as subpatterns. There are two
+- kinds: those that look ahead of the current position in the subject
+- string, and those that look behind it. An assertion subpattern is
+- matched in the normal way, except that it does not cause the current
++ More complicated assertions are coded as subpatterns. There are two
++ kinds: those that look ahead of the current position in the subject
++ string, and those that look behind it. An assertion subpattern is
++ matched in the normal way, except that it does not cause the current
+ matching position to be changed.
+
+- Assertion subpatterns are not capturing subpatterns. If such an asser-
+- tion contains capturing subpatterns within it, these are counted for
+- the purposes of numbering the capturing subpatterns in the whole pat-
+- tern. However, substring capturing is carried out only for positive
++ Assertion subpatterns are not capturing subpatterns. If such an asser-
++ tion contains capturing subpatterns within it, these are counted for
++ the purposes of numbering the capturing subpatterns in the whole pat-
++ tern. However, substring capturing is carried out only for positive
+ assertions. (Perl sometimes, but not always, does do capturing in nega-
+ tive assertions.)
+
+- For compatibility with Perl, assertion subpatterns may be repeated;
+- though it makes no sense to assert the same thing several times, the
+- side effect of capturing parentheses may occasionally be useful. In
++ For compatibility with Perl, assertion subpatterns may be repeated;
++ though it makes no sense to assert the same thing several times, the
++ side effect of capturing parentheses may occasionally be useful. In
+ practice, there only three cases:
+
+- (1) If the quantifier is {0}, the assertion is never obeyed during
+- matching. However, it may contain internal capturing parenthesized
++ (1) If the quantifier is {0}, the assertion is never obeyed during
++ matching. However, it may contain internal capturing parenthesized
+ groups that are called from elsewhere via the subroutine mechanism.
+
+- (2) If quantifier is {0,n} where n is greater than zero, it is treated
+- as if it were {0,1}. At run time, the rest of the pattern match is
++ (2) If quantifier is {0,n} where n is greater than zero, it is treated
++ as if it were {0,1}. At run time, the rest of the pattern match is
+ tried with and without the assertion, the order depending on the greed-
+ iness of the quantifier.
+
+- (3) If the minimum repetition is greater than zero, the quantifier is
+- ignored. The assertion is obeyed just once when encountered during
++ (3) If the minimum repetition is greater than zero, the quantifier is
++ ignored. The assertion is obeyed just once when encountered during
+ matching.
+
+ Lookahead assertions
+@@ -6623,38 +6647,38 @@ ASSERTIONS
+
+ \w+(?=;)
+
+- matches a word followed by a semicolon, but does not include the semi-
++ matches a word followed by a semicolon, but does not include the semi-
+ colon in the match, and
+
+ foo(?!bar)
+
+- matches any occurrence of "foo" that is not followed by "bar". Note
++ matches any occurrence of "foo" that is not followed by "bar". Note
+ that the apparently similar pattern
+
+ (?!foo)bar
+
+- does not find an occurrence of "bar" that is preceded by something
+- other than "foo"; it finds any occurrence of "bar" whatsoever, because
++ does not find an occurrence of "bar" that is preceded by something
++ other than "foo"; it finds any occurrence of "bar" whatsoever, because
+ the assertion (?!foo) is always true when the next three characters are
+ "bar". A lookbehind assertion is needed to achieve the other effect.
+
+ If you want to force a matching failure at some point in a pattern, the
+- most convenient way to do it is with (?!) because an empty string
+- always matches, so an assertion that requires there not to be an empty
++ most convenient way to do it is with (?!) because an empty string
++ always matches, so an assertion that requires there not to be an empty
+ string must always fail. The backtracking control verb (*FAIL) or (*F)
+ is a synonym for (?!).
+
+ Lookbehind assertions
+
+- Lookbehind assertions start with (?<= for positive assertions and (?<!
++ Lookbehind assertions start with (?<= for positive assertions and (?<!
+ for negative assertions. For example,
+
+ (?<!foo)bar
+
+- does find an occurrence of "bar" that is not preceded by "foo". The
+- contents of a lookbehind assertion are restricted such that all the
++ does find an occurrence of "bar" that is not preceded by "foo". The
++ contents of a lookbehind assertion are restricted such that all the
+ strings it matches must have a fixed length. However, if there are sev-
+- eral top-level alternatives, they do not all have to have the same
++ eral top-level alternatives, they do not all have to have the same
+ fixed length. Thus
+
+ (?<=bullock|donkey)
+@@ -6663,62 +6687,62 @@ ASSERTIONS
+
+ (?<!dogs?|cats?)
+
+- causes an error at compile time. Branches that match different length
+- strings are permitted only at the top level of a lookbehind assertion.
++ causes an error at compile time. Branches that match different length
++ strings are permitted only at the top level of a lookbehind assertion.
+ This is an extension compared with Perl, which requires all branches to
+ match the same length of string. An assertion such as
+
+ (?<=ab(c|de))
+
+- is not permitted, because its single top-level branch can match two
++ is not permitted, because its single top-level branch can match two
+ different lengths, but it is acceptable to PCRE if rewritten to use two
+ top-level branches:
+
+ (?<=abc|abde)
+
+- In some cases, the escape sequence \K (see above) can be used instead
++ In some cases, the escape sequence \K (see above) can be used instead
+ of a lookbehind assertion to get round the fixed-length restriction.
+
+- The implementation of lookbehind assertions is, for each alternative,
+- to temporarily move the current position back by the fixed length and
++ The implementation of lookbehind assertions is, for each alternative,
++ to temporarily move the current position back by the fixed length and
+ then try to match. If there are insufficient characters before the cur-
+ rent position, the assertion fails.
+
+- In a UTF mode, PCRE does not allow the \C escape (which matches a sin-
+- gle data unit even in a UTF mode) to appear in lookbehind assertions,
+- because it makes it impossible to calculate the length of the lookbe-
+- hind. The \X and \R escapes, which can match different numbers of data
++ In a UTF mode, PCRE does not allow the \C escape (which matches a sin-
++ gle data unit even in a UTF mode) to appear in lookbehind assertions,
++ because it makes it impossible to calculate the length of the lookbe-
++ hind. The \X and \R escapes, which can match different numbers of data
+ units, are also not permitted.
+
+- "Subroutine" calls (see below) such as (?2) or (?&X) are permitted in
+- lookbehinds, as long as the subpattern matches a fixed-length string.
++ "Subroutine" calls (see below) such as (?2) or (?&X) are permitted in
++ lookbehinds, as long as the subpattern matches a fixed-length string.
+ Recursion, however, is not supported.
+
+- Possessive quantifiers can be used in conjunction with lookbehind
++ Possessive quantifiers can be used in conjunction with lookbehind
+ assertions to specify efficient matching of fixed-length strings at the
+ end of subject strings. Consider a simple pattern such as
+
+ abcd$
+
+- when applied to a long string that does not match. Because matching
++ when applied to a long string that does not match. Because matching
+ proceeds from left to right, PCRE will look for each "a" in the subject
+- and then see if what follows matches the rest of the pattern. If the
++ and then see if what follows matches the rest of the pattern. If the
+ pattern is specified as
+
+ ^.*abcd$
+
+- the initial .* matches the entire string at first, but when this fails
++ the initial .* matches the entire string at first, but when this fails
+ (because there is no following "a"), it backtracks to match all but the
+- last character, then all but the last two characters, and so on. Once
+- again the search for "a" covers the entire string, from right to left,
++ last character, then all but the last two characters, and so on. Once
++ again the search for "a" covers the entire string, from right to left,
+ so we are no better off. However, if the pattern is written as
+
+ ^.*+(?<=abcd)
+
+- there can be no backtracking for the .*+ item; it can match only the
+- entire string. The subsequent lookbehind assertion does a single test
+- on the last four characters. If it fails, the match fails immediately.
+- For long strings, this approach makes a significant difference to the
++ there can be no backtracking for the .*+ item; it can match only the
++ entire string. The subsequent lookbehind assertion does a single test
++ on the last four characters. If it fails, the match fails immediately.
++ For long strings, this approach makes a significant difference to the
+ processing time.
+
+ Using multiple assertions
+@@ -6727,18 +6751,18 @@ ASSERTIONS
+
+ (?<=\d{3})(?<!999)foo
+
+- matches "foo" preceded by three digits that are not "999". Notice that
+- each of the assertions is applied independently at the same point in
+- the subject string. First there is a check that the previous three
+- characters are all digits, and then there is a check that the same
++ matches "foo" preceded by three digits that are not "999". Notice that
++ each of the assertions is applied independently at the same point in
++ the subject string. First there is a check that the previous three
++ characters are all digits, and then there is a check that the same
+ three characters are not "999". This pattern does not match "foo" pre-
+- ceded by six characters, the first of which are digits and the last
+- three of which are not "999". For example, it doesn't match "123abc-
++ ceded by six characters, the first of which are digits and the last
++ three of which are not "999". For example, it doesn't match "123abc-
+ foo". A pattern to do that is
+
+ (?<=\d{3}...)(?<!999)foo
+
+- This time the first assertion looks at the preceding six characters,
++ This time the first assertion looks at the preceding six characters,
+ checking that the first three are digits, and then the second assertion
+ checks that the preceding three characters are not "999".
+
+@@ -6746,29 +6770,29 @@ ASSERTIONS
+
+ (?<=(?<!foo)bar)baz
+
+- matches an occurrence of "baz" that is preceded by "bar" which in turn
++ matches an occurrence of "baz" that is preceded by "bar" which in turn
+ is not preceded by "foo", while
+
+ (?<=\d{3}(?!999)...)foo
+
+- is another pattern that matches "foo" preceded by three digits and any
++ is another pattern that matches "foo" preceded by three digits and any
+ three characters that are not "999".
+
+
+ CONDITIONAL SUBPATTERNS
+
+- It is possible to cause the matching process to obey a subpattern con-
+- ditionally or to choose between two alternative subpatterns, depending
+- on the result of an assertion, or whether a specific capturing subpat-
+- tern has already been matched. The two possible forms of conditional
++ It is possible to cause the matching process to obey a subpattern con-
++ ditionally or to choose between two alternative subpatterns, depending
++ on the result of an assertion, or whether a specific capturing subpat-
++ tern has already been matched. The two possible forms of conditional
+ subpattern are:
+
+ (?(condition)yes-pattern)
+ (?(condition)yes-pattern|no-pattern)
+
+- If the condition is satisfied, the yes-pattern is used; otherwise the
+- no-pattern (if present) is used. If there are more than two alterna-
+- tives in the subpattern, a compile-time error occurs. Each of the two
++ If the condition is satisfied, the yes-pattern is used; otherwise the
++ no-pattern (if present) is used. If there are more than two alterna-
++ tives in the subpattern, a compile-time error occurs. Each of the two
+ alternatives may itself contain nested subpatterns of any form, includ-
+ ing conditional subpatterns; the restriction to two alternatives
+ applies only at the level of the condition. This pattern fragment is an
+@@ -6777,68 +6801,68 @@ CONDITIONAL SUBPATTERNS
+ (?(1) (A|B|C) | (D | (?(2)E|F) | E) )
+
+
+- There are four kinds of condition: references to subpatterns, refer-
++ There are four kinds of condition: references to subpatterns, refer-
+ ences to recursion, a pseudo-condition called DEFINE, and assertions.
+
+ Checking for a used subpattern by number
+
+- If the text between the parentheses consists of a sequence of digits,
++ If the text between the parentheses consists of a sequence of digits,
+ the condition is true if a capturing subpattern of that number has pre-
+- viously matched. If there is more than one capturing subpattern with
+- the same number (see the earlier section about duplicate subpattern
+- numbers), the condition is true if any of them have matched. An alter-
+- native notation is to precede the digits with a plus or minus sign. In
+- this case, the subpattern number is relative rather than absolute. The
+- most recently opened parentheses can be referenced by (?(-1), the next
+- most recent by (?(-2), and so on. Inside loops it can also make sense
++ viously matched. If there is more than one capturing subpattern with
++ the same number (see the earlier section about duplicate subpattern
++ numbers), the condition is true if any of them have matched. An alter-
++ native notation is to precede the digits with a plus or minus sign. In
++ this case, the subpattern number is relative rather than absolute. The
++ most recently opened parentheses can be referenced by (?(-1), the next
++ most recent by (?(-2), and so on. Inside loops it can also make sense
+ to refer to subsequent groups. The next parentheses to be opened can be
+- referenced as (?(+1), and so on. (The value zero in any of these forms
++ referenced as (?(+1), and so on. (The value zero in any of these forms
+ is not used; it provokes a compile-time error.)
+
+- Consider the following pattern, which contains non-significant white
++ Consider the following pattern, which contains non-significant white
+ space to make it more readable (assume the PCRE_EXTENDED option) and to
+ divide it into three parts for ease of discussion:
+
+ ( \( )? [^()]+ (?(1) \) )
+
+- The first part matches an optional opening parenthesis, and if that
++ The first part matches an optional opening parenthesis, and if that
+ character is present, sets it as the first captured substring. The sec-
+- ond part matches one or more characters that are not parentheses. The
+- third part is a conditional subpattern that tests whether or not the
+- first set of parentheses matched. If they did, that is, if subject
+- started with an opening parenthesis, the condition is true, and so the
+- yes-pattern is executed and a closing parenthesis is required. Other-
+- wise, since no-pattern is not present, the subpattern matches nothing.
+- In other words, this pattern matches a sequence of non-parentheses,
++ ond part matches one or more characters that are not parentheses. The
++ third part is a conditional subpattern that tests whether or not the
++ first set of parentheses matched. If they did, that is, if subject
++ started with an opening parenthesis, the condition is true, and so the
++ yes-pattern is executed and a closing parenthesis is required. Other-
++ wise, since no-pattern is not present, the subpattern matches nothing.
++ In other words, this pattern matches a sequence of non-parentheses,
+ optionally enclosed in parentheses.
+
+- If you were embedding this pattern in a larger one, you could use a
++ If you were embedding this pattern in a larger one, you could use a
+ relative reference:
+
+ ...other stuff... ( \( )? [^()]+ (?(-1) \) ) ...
+
+- This makes the fragment independent of the parentheses in the larger
++ This makes the fragment independent of the parentheses in the larger
+ pattern.
+
+ Checking for a used subpattern by name
+
+- Perl uses the syntax (?(<name>)...) or (?('name')...) to test for a
+- used subpattern by name. For compatibility with earlier versions of
+- PCRE, which had this facility before Perl, the syntax (?(name)...) is
++ Perl uses the syntax (?(<name>)...) or (?('name')...) to test for a
++ used subpattern by name. For compatibility with earlier versions of
++ PCRE, which had this facility before Perl, the syntax (?(name)...) is
+ also recognized.
+
+ Rewriting the above example to use a named subpattern gives this:
+
+ (?<OPEN> \( )? [^()]+ (?(<OPEN>) \) )
+
+- If the name used in a condition of this kind is a duplicate, the test
+- is applied to all subpatterns of the same name, and is true if any one
++ If the name used in a condition of this kind is a duplicate, the test
++ is applied to all subpatterns of the same name, and is true if any one
+ of them has matched.
+
+ Checking for pattern recursion
+
+ If the condition is the string (R), and there is no subpattern with the
+- name R, the condition is true if a recursive call to the whole pattern
++ name R, the condition is true if a recursive call to the whole pattern
+ or any subpattern has been made. If digits or a name preceded by amper-
+ sand follow the letter R, for example:
+
+@@ -6846,51 +6870,51 @@ CONDITIONAL SUBPATTERNS
+
+ the condition is true if the most recent recursion is into a subpattern
+ whose number or name is given. This condition does not check the entire
+- recursion stack. If the name used in a condition of this kind is a
++ recursion stack. If the name used in a condition of this kind is a
+ duplicate, the test is applied to all subpatterns of the same name, and
+ is true if any one of them is the most recent recursion.
+
+- At "top level", all these recursion test conditions are false. The
++ At "top level", all these recursion test conditions are false. The
+ syntax for recursive patterns is described below.
+
+ Defining subpatterns for use by reference only
+
+- If the condition is the string (DEFINE), and there is no subpattern
+- with the name DEFINE, the condition is always false. In this case,
+- there may be only one alternative in the subpattern. It is always
+- skipped if control reaches this point in the pattern; the idea of
+- DEFINE is that it can be used to define subroutines that can be refer-
+- enced from elsewhere. (The use of subroutines is described below.) For
+- example, a pattern to match an IPv4 address such as "192.168.23.245"
++ If the condition is the string (DEFINE), and there is no subpattern
++ with the name DEFINE, the condition is always false. In this case,
++ there may be only one alternative in the subpattern. It is always
++ skipped if control reaches this point in the pattern; the idea of
++ DEFINE is that it can be used to define subroutines that can be refer-
++ enced from elsewhere. (The use of subroutines is described below.) For
++ example, a pattern to match an IPv4 address such as "192.168.23.245"
+ could be written like this (ignore white space and line breaks):
+
+ (?(DEFINE) (?<byte> 2[0-4]\d | 25[0-5] | 1\d\d | [1-9]?\d) )
+ \b (?&byte) (\.(?&byte)){3} \b
+
+- The first part of the pattern is a DEFINE group inside which a another
+- group named "byte" is defined. This matches an individual component of
+- an IPv4 address (a number less than 256). When matching takes place,
+- this part of the pattern is skipped because DEFINE acts like a false
+- condition. The rest of the pattern uses references to the named group
+- to match the four dot-separated components of an IPv4 address, insist-
++ The first part of the pattern is a DEFINE group inside which another
++ group named "byte" is defined. This matches an individual component of
++ an IPv4 address (a number less than 256). When matching takes place,
++ this part of the pattern is skipped because DEFINE acts like a false
++ condition. The rest of the pattern uses references to the named group
++ to match the four dot-separated components of an IPv4 address, insist-
+ ing on a word boundary at each end.
+
+ Assertion conditions
+
+- If the condition is not in any of the above formats, it must be an
+- assertion. This may be a positive or negative lookahead or lookbehind
+- assertion. Consider this pattern, again containing non-significant
++ If the condition is not in any of the above formats, it must be an
++ assertion. This may be a positive or negative lookahead or lookbehind
++ assertion. Consider this pattern, again containing non-significant
+ white space, and with the two alternatives on the second line:
+
+ (?(?=[^a-z]*[a-z])
+ \d{2}-[a-z]{3}-\d{2} | \d{2}-\d{2}-\d{2} )
+
+- The condition is a positive lookahead assertion that matches an
+- optional sequence of non-letters followed by a letter. In other words,
+- it tests for the presence of at least one letter in the subject. If a
+- letter is found, the subject is matched against the first alternative;
+- otherwise it is matched against the second. This pattern matches
+- strings in one of the two forms dd-aaa-dd or dd-dd-dd, where aaa are
++ The condition is a positive lookahead assertion that matches an
++ optional sequence of non-letters followed by a letter. In other words,
++ it tests for the presence of at least one letter in the subject. If a
++ letter is found, the subject is matched against the first alternative;
++ otherwise it is matched against the second. This pattern matches
++ strings in one of the two forms dd-aaa-dd or dd-dd-dd, where aaa are
+ letters and dd are digits.
+
+
+@@ -6899,41 +6923,41 @@ COMMENTS
+ There are two ways of including comments in patterns that are processed
+ by PCRE. In both cases, the start of the comment must not be in a char-
+ acter class, nor in the middle of any other sequence of related charac-
+- ters such as (?: or a subpattern name or number. The characters that
++ ters such as (?: or a subpattern name or number. The characters that
+ make up a comment play no part in the pattern matching.
+
+- The sequence (?# marks the start of a comment that continues up to the
+- next closing parenthesis. Nested parentheses are not permitted. If the
++ The sequence (?# marks the start of a comment that continues up to the
++ next closing parenthesis. Nested parentheses are not permitted. If the
+ PCRE_EXTENDED option is set, an unescaped # character also introduces a
+- comment, which in this case continues to immediately after the next
+- newline character or character sequence in the pattern. Which charac-
++ comment, which in this case continues to immediately after the next
++ newline character or character sequence in the pattern. Which charac-
+ ters are interpreted as newlines is controlled by the options passed to
+- a compiling function or by a special sequence at the start of the pat-
++ a compiling function or by a special sequence at the start of the pat-
+ tern, as described in the section entitled "Newline conventions" above.
+ Note that the end of this type of comment is a literal newline sequence
+- in the pattern; escape sequences that happen to represent a newline do
+- not count. For example, consider this pattern when PCRE_EXTENDED is
++ in the pattern; escape sequences that happen to represent a newline do
++ not count. For example, consider this pattern when PCRE_EXTENDED is
+ set, and the default newline convention is in force:
+
+ abc #comment \n still comment
+
+- On encountering the # character, pcre_compile() skips along, looking
+- for a newline in the pattern. The sequence \n is still literal at this
+- stage, so it does not terminate the comment. Only an actual character
++ On encountering the # character, pcre_compile() skips along, looking
++ for a newline in the pattern. The sequence \n is still literal at this
++ stage, so it does not terminate the comment. Only an actual character
+ with the code value 0x0a (the default newline) does so.
+
+
+ RECURSIVE PATTERNS
+
+- Consider the problem of matching a string in parentheses, allowing for
+- unlimited nested parentheses. Without the use of recursion, the best
+- that can be done is to use a pattern that matches up to some fixed
+- depth of nesting. It is not possible to handle an arbitrary nesting
++ Consider the problem of matching a string in parentheses, allowing for
++ unlimited nested parentheses. Without the use of recursion, the best
++ that can be done is to use a pattern that matches up to some fixed
++ depth of nesting. It is not possible to handle an arbitrary nesting
+ depth.
+
+ For some time, Perl has provided a facility that allows regular expres-
+- sions to recurse (amongst other things). It does this by interpolating
+- Perl code in the expression at run time, and the code can refer to the
++ sions to recurse (amongst other things). It does this by interpolating
++ Perl code in the expression at run time, and the code can refer to the
+ expression itself. A Perl pattern using code interpolation to solve the
+ parentheses problem can be created like this:
+
+@@ -6943,201 +6967,201 @@ RECURSIVE PATTERNS
+ refers recursively to the pattern in which it appears.
+
+ Obviously, PCRE cannot support the interpolation of Perl code. Instead,
+- it supports special syntax for recursion of the entire pattern, and
+- also for individual subpattern recursion. After its introduction in
+- PCRE and Python, this kind of recursion was subsequently introduced
++ it supports special syntax for recursion of the entire pattern, and
++ also for individual subpattern recursion. After its introduction in
++ PCRE and Python, this kind of recursion was subsequently introduced
+ into Perl at release 5.10.
+
+- A special item that consists of (? followed by a number greater than
+- zero and a closing parenthesis is a recursive subroutine call of the
+- subpattern of the given number, provided that it occurs inside that
+- subpattern. (If not, it is a non-recursive subroutine call, which is
+- described in the next section.) The special item (?R) or (?0) is a
++ A special item that consists of (? followed by a number greater than
++ zero and a closing parenthesis is a recursive subroutine call of the
++ subpattern of the given number, provided that it occurs inside that
++ subpattern. (If not, it is a non-recursive subroutine call, which is
++ described in the next section.) The special item (?R) or (?0) is a
+ recursive call of the entire regular expression.
+
+- This PCRE pattern solves the nested parentheses problem (assume the
++ This PCRE pattern solves the nested parentheses problem (assume the
+ PCRE_EXTENDED option is set so that white space is ignored):
+
+ \( ( [^()]++ | (?R) )* \)
+
+- First it matches an opening parenthesis. Then it matches any number of
+- substrings which can either be a sequence of non-parentheses, or a
+- recursive match of the pattern itself (that is, a correctly parenthe-
++ First it matches an opening parenthesis. Then it matches any number of
++ substrings which can either be a sequence of non-parentheses, or a
++ recursive match of the pattern itself (that is, a correctly parenthe-
+ sized substring). Finally there is a closing parenthesis. Note the use
+ of a possessive quantifier to avoid backtracking into sequences of non-
+ parentheses.
+
+- If this were part of a larger pattern, you would not want to recurse
++ If this were part of a larger pattern, you would not want to recurse
+ the entire pattern, so instead you could use this:
+
+ ( \( ( [^()]++ | (?1) )* \) )
+
+- We have put the pattern into parentheses, and caused the recursion to
++ We have put the pattern into parentheses, and caused the recursion to
+ refer to them instead of the whole pattern.
+
+- In a larger pattern, keeping track of parenthesis numbers can be
+- tricky. This is made easier by the use of relative references. Instead
++ In a larger pattern, keeping track of parenthesis numbers can be
++ tricky. This is made easier by the use of relative references. Instead
+ of (?1) in the pattern above you can write (?-2) to refer to the second
+- most recently opened parentheses preceding the recursion. In other
+- words, a negative number counts capturing parentheses leftwards from
++ most recently opened parentheses preceding the recursion. In other
++ words, a negative number counts capturing parentheses leftwards from
+ the point at which it is encountered.
+
+- It is also possible to refer to subsequently opened parentheses, by
+- writing references such as (?+2). However, these cannot be recursive
+- because the reference is not inside the parentheses that are refer-
+- enced. They are always non-recursive subroutine calls, as described in
++ It is also possible to refer to subsequently opened parentheses, by
++ writing references such as (?+2). However, these cannot be recursive
++ because the reference is not inside the parentheses that are refer-
++ enced. They are always non-recursive subroutine calls, as described in
+ the next section.
+
+- An alternative approach is to use named parentheses instead. The Perl
+- syntax for this is (?&name); PCRE's earlier syntax (?P>name) is also
++ An alternative approach is to use named parentheses instead. The Perl
++ syntax for this is (?&name); PCRE's earlier syntax (?P>name) is also
+ supported. We could rewrite the above example as follows:
+
+ (?<pn> \( ( [^()]++ | (?&pn) )* \) )
+
+- If there is more than one subpattern with the same name, the earliest
++ If there is more than one subpattern with the same name, the earliest
+ one is used.
+
+- This particular example pattern that we have been looking at contains
++ This particular example pattern that we have been looking at contains
+ nested unlimited repeats, and so the use of a possessive quantifier for
+ matching strings of non-parentheses is important when applying the pat-
+- tern to strings that do not match. For example, when this pattern is
++ tern to strings that do not match. For example, when this pattern is
+ applied to
+
+ (aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa()
+
+- it yields "no match" quickly. However, if a possessive quantifier is
+- not used, the match runs for a very long time indeed because there are
+- so many different ways the + and * repeats can carve up the subject,
++ it yields "no match" quickly. However, if a possessive quantifier is
++ not used, the match runs for a very long time indeed because there are
++ so many different ways the + and * repeats can carve up the subject,
+ and all have to be tested before failure can be reported.
+
+- At the end of a match, the values of capturing parentheses are those
+- from the outermost level. If you want to obtain intermediate values, a
+- callout function can be used (see below and the pcrecallout documenta-
++ At the end of a match, the values of capturing parentheses are those
++ from the outermost level. If you want to obtain intermediate values, a
++ callout function can be used (see below and the pcrecallout documenta-
+ tion). If the pattern above is matched against
+
+ (ab(cd)ef)
+
+- the value for the inner capturing parentheses (numbered 2) is "ef",
+- which is the last value taken on at the top level. If a capturing sub-
+- pattern is not matched at the top level, its final captured value is
+- unset, even if it was (temporarily) set at a deeper level during the
++ the value for the inner capturing parentheses (numbered 2) is "ef",
++ which is the last value taken on at the top level. If a capturing sub-
++ pattern is not matched at the top level, its final captured value is
++ unset, even if it was (temporarily) set at a deeper level during the
+ matching process.
+
+- If there are more than 15 capturing parentheses in a pattern, PCRE has
+- to obtain extra memory to store data during a recursion, which it does
++ If there are more than 15 capturing parentheses in a pattern, PCRE has
++ to obtain extra memory to store data during a recursion, which it does
+ by using pcre_malloc, freeing it via pcre_free afterwards. If no memory
+ can be obtained, the match fails with the PCRE_ERROR_NOMEMORY error.
+
+- Do not confuse the (?R) item with the condition (R), which tests for
+- recursion. Consider this pattern, which matches text in angle brack-
+- ets, allowing for arbitrary nesting. Only digits are allowed in nested
+- brackets (that is, when recursing), whereas any characters are permit-
++ Do not confuse the (?R) item with the condition (R), which tests for
++ recursion. Consider this pattern, which matches text in angle brack-
++ ets, allowing for arbitrary nesting. Only digits are allowed in nested
++ brackets (that is, when recursing), whereas any characters are permit-
+ ted at the outer level.
+
+ < (?: (?(R) \d++ | [^<>]*+) | (?R)) * >
+
+- In this pattern, (?(R) is the start of a conditional subpattern, with
+- two different alternatives for the recursive and non-recursive cases.
++ In this pattern, (?(R) is the start of a conditional subpattern, with
++ two different alternatives for the recursive and non-recursive cases.
+ The (?R) item is the actual recursive call.
+
+ Differences in recursion processing between PCRE and Perl
+
+- Recursion processing in PCRE differs from Perl in two important ways.
+- In PCRE (like Python, but unlike Perl), a recursive subpattern call is
++ Recursion processing in PCRE differs from Perl in two important ways.
++ In PCRE (like Python, but unlike Perl), a recursive subpattern call is
+ always treated as an atomic group. That is, once it has matched some of
+ the subject string, it is never re-entered, even if it contains untried
+- alternatives and there is a subsequent matching failure. This can be
+- illustrated by the following pattern, which purports to match a palin-
+- dromic string that contains an odd number of characters (for example,
++ alternatives and there is a subsequent matching failure. This can be
++ illustrated by the following pattern, which purports to match a palin-
++ dromic string that contains an odd number of characters (for example,
+ "a", "aba", "abcba", "abcdcba"):
+
+ ^(.|(.)(?1)\2)$
+
+ The idea is that it either matches a single character, or two identical
+- characters surrounding a sub-palindrome. In Perl, this pattern works;
+- in PCRE it does not if the pattern is longer than three characters.
++ characters surrounding a sub-palindrome. In Perl, this pattern works;
++ in PCRE it does not if the subject is longer than three characters.
+ Consider the subject string "abcba":
+
+- At the top level, the first character is matched, but as it is not at
++ At the top level, the first character is matched, but as it is not at
+ the end of the string, the first alternative fails; the second alterna-
+ tive is taken and the recursion kicks in. The recursive call to subpat-
+- tern 1 successfully matches the next character ("b"). (Note that the
++ tern 1 successfully matches the next character ("b"). (Note that the
+ beginning and end of line tests are not part of the recursion).
+
+- Back at the top level, the next character ("c") is compared with what
+- subpattern 2 matched, which was "a". This fails. Because the recursion
+- is treated as an atomic group, there are now no backtracking points,
+- and so the entire match fails. (Perl is able, at this point, to re-
+- enter the recursion and try the second alternative.) However, if the
++ Back at the top level, the next character ("c") is compared with what
++ subpattern 2 matched, which was "a". This fails. Because the recursion
++ is treated as an atomic group, there are now no backtracking points,
++ and so the entire match fails. (Perl is able, at this point, to re-
++ enter the recursion and try the second alternative.) However, if the
+ pattern is written with the alternatives in the other order, things are
+ different:
+
+ ^((.)(?1)\2|.)$
+
+- This time, the recursing alternative is tried first, and continues to
+- recurse until it runs out of characters, at which point the recursion
+- fails. But this time we do have another alternative to try at the
+- higher level. That is the big difference: in the previous case the
++ This time, the recursing alternative is tried first, and continues to
++ recurse until it runs out of characters, at which point the recursion
++ fails. But this time we do have another alternative to try at the
++ higher level. That is the big difference: in the previous case the
+ remaining alternative is at a deeper recursion level, which PCRE cannot
+ use.
+
+- To change the pattern so that it matches all palindromic strings, not
+- just those with an odd number of characters, it is tempting to change
++ To change the pattern so that it matches all palindromic strings, not
++ just those with an odd number of characters, it is tempting to change
+ the pattern to this:
+
+ ^((.)(?1)\2|.?)$
+
+- Again, this works in Perl, but not in PCRE, and for the same reason.
+- When a deeper recursion has matched a single character, it cannot be
+- entered again in order to match an empty string. The solution is to
+- separate the two cases, and write out the odd and even cases as alter-
++ Again, this works in Perl, but not in PCRE, and for the same reason.
++ When a deeper recursion has matched a single character, it cannot be
++ entered again in order to match an empty string. The solution is to
++ separate the two cases, and write out the odd and even cases as alter-
+ natives at the higher level:
+
+ ^(?:((.)(?1)\2|)|((.)(?3)\4|.))
+
+- If you want to match typical palindromic phrases, the pattern has to
++ If you want to match typical palindromic phrases, the pattern has to
+ ignore all non-word characters, which can be done like this:
+
+ ^\W*+(?:((.)\W*+(?1)\W*+\2|)|((.)\W*+(?3)\W*+\4|\W*+.\W*+))\W*+$
+
+ If run with the PCRE_CASELESS option, this pattern matches phrases such
+ as "A man, a plan, a canal: Panama!" and it works well in both PCRE and
+- Perl. Note the use of the possessive quantifier *+ to avoid backtrack-
+- ing into sequences of non-word characters. Without this, PCRE takes a
+- great deal longer (ten times or more) to match typical phrases, and
++ Perl. Note the use of the possessive quantifier *+ to avoid backtrack-
++ ing into sequences of non-word characters. Without this, PCRE takes a
++ great deal longer (ten times or more) to match typical phrases, and
+ Perl takes so long that you think it has gone into a loop.
+
+- WARNING: The palindrome-matching patterns above work only if the sub-
+- ject string does not start with a palindrome that is shorter than the
+- entire string. For example, although "abcba" is correctly matched, if
+- the subject is "ababa", PCRE finds the palindrome "aba" at the start,
+- then fails at top level because the end of the string does not follow.
+- Once again, it cannot jump back into the recursion to try other alter-
++ WARNING: The palindrome-matching patterns above work only if the sub-
++ ject string does not start with a palindrome that is shorter than the
++ entire string. For example, although "abcba" is correctly matched, if
++ the subject is "ababa", PCRE finds the palindrome "aba" at the start,
++ then fails at top level because the end of the string does not follow.
++ Once again, it cannot jump back into the recursion to try other alter-
+ natives, so the entire match fails.
+
+- The second way in which PCRE and Perl differ in their recursion pro-
+- cessing is in the handling of captured values. In Perl, when a subpat-
+- tern is called recursively or as a subpattern (see the next section),
+- it has no access to any values that were captured outside the recur-
+- sion, whereas in PCRE these values can be referenced. Consider this
++ The second way in which PCRE and Perl differ in their recursion pro-
++ cessing is in the handling of captured values. In Perl, when a subpat-
++ tern is called recursively or as a subroutine (see the next section),
++ it has no access to any values that were captured outside the recur-
++ sion, whereas in PCRE these values can be referenced. Consider this
+ pattern:
+
+ ^(.)(\1|a(?2))
+
+- In PCRE, this pattern matches "bab". The first capturing parentheses
+- match "b", then in the second group, when the back reference \1 fails
+- to match "b", the second alternative matches "a" and then recurses. In
+- the recursion, \1 does now match "b" and so the whole match succeeds.
+- In Perl, the pattern fails to match because inside the recursive call
++ In PCRE, this pattern matches "bab". The first capturing parentheses
++ match "b", then in the second group, when the back reference \1 fails
++ to match "b", the second alternative matches "a" and then recurses. In
++ the recursion, \1 does now match "b" and so the whole match succeeds.
++ In Perl, the pattern fails to match because inside the recursive call
+ \1 cannot access the externally set value.
+
+
+ SUBPATTERNS AS SUBROUTINES
+
+- If the syntax for a recursive subpattern call (either by number or by
+- name) is used outside the parentheses to which it refers, it operates
+- like a subroutine in a programming language. The called subpattern may
+- be defined before or after the reference. A numbered reference can be
++ If the syntax for a recursive subpattern call (either by number or by
++ name) is used outside the parentheses to which it refers, it operates
++ like a subroutine in a programming language. The called subpattern may
++ be defined before or after the reference. A numbered reference can be
+ absolute or relative, as in these examples:
+
+ (...(absolute)...)...(?2)...
+@@ -7148,79 +7172,79 @@ SUBPATTERNS AS SUBROUTINES
+
+ (sens|respons)e and \1ibility
+
+- matches "sense and sensibility" and "response and responsibility", but
++ matches "sense and sensibility" and "response and responsibility", but
+ not "sense and responsibility". If instead the pattern
+
+ (sens|respons)e and (?1)ibility
+
+- is used, it does match "sense and responsibility" as well as the other
+- two strings. Another example is given in the discussion of DEFINE
++ is used, it does match "sense and responsibility" as well as the other
++ two strings. Another example is given in the discussion of DEFINE
+ above.
+
+- All subroutine calls, whether recursive or not, are always treated as
+- atomic groups. That is, once a subroutine has matched some of the sub-
++ All subroutine calls, whether recursive or not, are always treated as
++ atomic groups. That is, once a subroutine has matched some of the sub-
+ ject string, it is never re-entered, even if it contains untried alter-
+- natives and there is a subsequent matching failure. Any capturing
+- parentheses that are set during the subroutine call revert to their
++ natives and there is a subsequent matching failure. Any capturing
++ parentheses that are set during the subroutine call revert to their
+ previous values afterwards.
+
+- Processing options such as case-independence are fixed when a subpat-
+- tern is defined, so if it is used as a subroutine, such options cannot
++ Processing options such as case-independence are fixed when a subpat-
++ tern is defined, so if it is used as a subroutine, such options cannot
+ be changed for different calls. For example, consider this pattern:
+
+ (abc)(?i:(?-1))
+
+- It matches "abcabc". It does not match "abcABC" because the change of
++ It matches "abcabc". It does not match "abcABC" because the change of
+ processing option does not affect the called subpattern.
+
+
+ ONIGURUMA SUBROUTINE SYNTAX
+
+- For compatibility with Oniguruma, the non-Perl syntax \g followed by a
++ For compatibility with Oniguruma, the non-Perl syntax \g followed by a
+ name or a number enclosed either in angle brackets or single quotes, is
+- an alternative syntax for referencing a subpattern as a subroutine,
+- possibly recursively. Here are two of the examples used above, rewrit-
++ an alternative syntax for referencing a subpattern as a subroutine,
++ possibly recursively. Here are two of the examples used above, rewrit-
+ ten using this syntax:
+
+ (?<pn> \( ( (?>[^()]+) | \g<pn> )* \) )
+ (sens|respons)e and \g'1'ibility
+
+- PCRE supports an extension to Oniguruma: if a number is preceded by a
++ PCRE supports an extension to Oniguruma: if a number is preceded by a
+ plus or a minus sign it is taken as a relative reference. For example:
+
+ (abc)(?i:\g<-1>)
+
+- Note that \g{...} (Perl syntax) and \g<...> (Oniguruma syntax) are not
+- synonymous. The former is a back reference; the latter is a subroutine
++ Note that \g{...} (Perl syntax) and \g<...> (Oniguruma syntax) are not
++ synonymous. The former is a back reference; the latter is a subroutine
+ call.
+
+
+ CALLOUTS
+
+ Perl has a feature whereby using the sequence (?{...}) causes arbitrary
+- Perl code to be obeyed in the middle of matching a regular expression.
++ Perl code to be obeyed in the middle of matching a regular expression.
+ This makes it possible, amongst other things, to extract different sub-
+ strings that match the same pair of parentheses when there is a repeti-
+ tion.
+
+ PCRE provides a similar feature, but of course it cannot obey arbitrary
+ Perl code. The feature is called "callout". The caller of PCRE provides
+- an external function by putting its entry point in the global variable
+- pcre_callout (8-bit library) or pcre[16|32]_callout (16-bit or 32-bit
+- library). By default, this variable contains NULL, which disables all
++ an external function by putting its entry point in the global variable
++ pcre_callout (8-bit library) or pcre[16|32]_callout (16-bit or 32-bit
++ library). By default, this variable contains NULL, which disables all
+ calling out.
+
+- Within a regular expression, (?C) indicates the points at which the
+- external function is to be called. If you want to identify different
+- callout points, you can put a number less than 256 after the letter C.
+- The default value is zero. For example, this pattern has two callout
++ Within a regular expression, (?C) indicates the points at which the
++ external function is to be called. If you want to identify different
++ callout points, you can put a number less than 256 after the letter C.
++ The default value is zero. For example, this pattern has two callout
+ points:
+
+ (?C1)abc(?C2)def
+
+- If the PCRE_AUTO_CALLOUT flag is passed to a compiling function, call-
+- outs are automatically installed before each item in the pattern. They
+- are all numbered 255. If there is a conditional group in the pattern
++ If the PCRE_AUTO_CALLOUT flag is passed to a compiling function, call-
++ outs are automatically installed before each item in the pattern. They
++ are all numbered 255. If there is a conditional group in the pattern
+ whose condition is an assertion, an additional callout is inserted just
+ before the condition. An explicit callout may also be set at this posi-
+ tion, as in this example:
+@@ -7230,120 +7254,120 @@ CALLOUTS
+ Note that this applies only to assertion conditions, not to other types
+ of condition.
+
+- During matching, when PCRE reaches a callout point, the external func-
+- tion is called. It is provided with the number of the callout, the
+- position in the pattern, and, optionally, one item of data originally
+- supplied by the caller of the matching function. The callout function
++ During matching, when PCRE reaches a callout point, the external func-
++ tion is called. It is provided with the number of the callout, the
++ position in the pattern, and, optionally, one item of data originally
++ supplied by the caller of the matching function. The callout function
+ may cause matching to proceed, to backtrack, or to fail altogether.
+
+- By default, PCRE implements a number of optimizations at compile time
+- and matching time, and one side-effect is that sometimes callouts are
+- skipped. If you need all possible callouts to happen, you need to set
+- options that disable the relevant optimizations. More details, and a
+- complete description of the interface to the callout function, are
++ By default, PCRE implements a number of optimizations at compile time
++ and matching time, and one side-effect is that sometimes callouts are
++ skipped. If you need all possible callouts to happen, you need to set
++ options that disable the relevant optimizations. More details, and a
++ complete description of the interface to the callout function, are
+ given in the pcrecallout documentation.
+
+
+ BACKTRACKING CONTROL
+
+- Perl 5.10 introduced a number of "Special Backtracking Control Verbs",
+- which are still described in the Perl documentation as "experimental
+- and subject to change or removal in a future version of Perl". It goes
+- on to say: "Their usage in production code should be noted to avoid
+- problems during upgrades." The same remarks apply to the PCRE features
++ Perl 5.10 introduced a number of "Special Backtracking Control Verbs",
++ which are still described in the Perl documentation as "experimental
++ and subject to change or removal in a future version of Perl". It goes
++ on to say: "Their usage in production code should be noted to avoid
++ problems during upgrades." The same remarks apply to the PCRE features
+ described in this section.
+
+- The new verbs make use of what was previously invalid syntax: an open-
++ The new verbs make use of what was previously invalid syntax: an open-
+ ing parenthesis followed by an asterisk. They are generally of the form
+- (*VERB) or (*VERB:NAME). Some may take either form, possibly behaving
+- differently depending on whether or not a name is present. A name is
++ (*VERB) or (*VERB:NAME). Some may take either form, possibly behaving
++ differently depending on whether or not a name is present. A name is
+ any sequence of characters that does not include a closing parenthesis.
+ The maximum length of name is 255 in the 8-bit library and 65535 in the
+- 16-bit and 32-bit libraries. If the name is empty, that is, if the
+- closing parenthesis immediately follows the colon, the effect is as if
+- the colon were not there. Any number of these verbs may occur in a
++ 16-bit and 32-bit libraries. If the name is empty, that is, if the
++ closing parenthesis immediately follows the colon, the effect is as if
++ the colon were not there. Any number of these verbs may occur in a
+ pattern.
+
+- Since these verbs are specifically related to backtracking, most of
+- them can be used only when the pattern is to be matched using one of
+- the traditional matching functions, because these use a backtracking
+- algorithm. With the exception of (*FAIL), which behaves like a failing
+- negative assertion, the backtracking control verbs cause an error if
++ Since these verbs are specifically related to backtracking, most of
++ them can be used only when the pattern is to be matched using one of
++ the traditional matching functions, because these use a backtracking
++ algorithm. With the exception of (*FAIL), which behaves like a failing
++ negative assertion, the backtracking control verbs cause an error if
+ encountered by a DFA matching function.
+
+- The behaviour of these verbs in repeated groups, assertions, and in
++ The behaviour of these verbs in repeated groups, assertions, and in
+ subpatterns called as subroutines (whether or not recursively) is docu-
+ mented below.
+
+ Optimizations that affect backtracking verbs
+
+- PCRE contains some optimizations that are used to speed up matching by
++ PCRE contains some optimizations that are used to speed up matching by
+ running some checks at the start of each match attempt. For example, it
+- may know the minimum length of matching subject, or that a particular
++ may know the minimum length of matching subject, or that a particular
+ character must be present. When one of these optimizations bypasses the
+- running of a match, any included backtracking verbs will not, of
++ running of a match, any included backtracking verbs will not, of
+ course, be processed. You can suppress the start-of-match optimizations
+- by setting the PCRE_NO_START_OPTIMIZE option when calling pcre_com-
++ by setting the PCRE_NO_START_OPTIMIZE option when calling pcre_com-
+ pile() or pcre_exec(), or by starting the pattern with (*NO_START_OPT).
+ There is more discussion of this option in the section entitled "Option
+ bits for pcre_exec()" in the pcreapi documentation.
+
+- Experiments with Perl suggest that it too has similar optimizations,
++ Experiments with Perl suggest that it too has similar optimizations,
+ sometimes leading to anomalous results.
+
+ Verbs that act immediately
+
+- The following verbs act as soon as they are encountered. They may not
++ The following verbs act as soon as they are encountered. They may not
+ be followed by a name.
+
+ (*ACCEPT)
+
+- This verb causes the match to end successfully, skipping the remainder
+- of the pattern. However, when it is inside a subpattern that is called
+- as a subroutine, only that subpattern is ended successfully. Matching
++ This verb causes the match to end successfully, skipping the remainder
++ of the pattern. However, when it is inside a subpattern that is called
++ as a subroutine, only that subpattern is ended successfully. Matching
+ then continues at the outer level. If (*ACCEPT) in triggered in a posi-
+- tive assertion, the assertion succeeds; in a negative assertion, the
++ tive assertion, the assertion succeeds; in a negative assertion, the
+ assertion fails.
+
+- If (*ACCEPT) is inside capturing parentheses, the data so far is cap-
++ If (*ACCEPT) is inside capturing parentheses, the data so far is cap-
+ tured. For example:
+
+ A((?:A|B(*ACCEPT)|C)D)
+
+- This matches "AB", "AAD", or "ACD"; when it matches "AB", "B" is cap-
++ This matches "AB", "AAD", or "ACD"; when it matches "AB", "B" is cap-
+ tured by the outer parentheses.
+
+ (*FAIL) or (*F)
+
+- This verb causes a matching failure, forcing backtracking to occur. It
+- is equivalent to (?!) but easier to read. The Perl documentation notes
+- that it is probably useful only when combined with (?{}) or (??{}).
+- Those are, of course, Perl features that are not present in PCRE. The
+- nearest equivalent is the callout feature, as for example in this pat-
++ This verb causes a matching failure, forcing backtracking to occur. It
++ is equivalent to (?!) but easier to read. The Perl documentation notes
++ that it is probably useful only when combined with (?{}) or (??{}).
++ Those are, of course, Perl features that are not present in PCRE. The
++ nearest equivalent is the callout feature, as for example in this pat-
+ tern:
+
+ a+(?C)(*FAIL)
+
+- A match with the string "aaaa" always fails, but the callout is taken
++ A match with the string "aaaa" always fails, but the callout is taken
+ before each backtrack happens (in this example, 10 times).
+
+ Recording which path was taken
+
+- There is one verb whose main purpose is to track how a match was
+- arrived at, though it also has a secondary use in conjunction with
++ There is one verb whose main purpose is to track how a match was
++ arrived at, though it also has a secondary use in conjunction with
+ advancing the match starting point (see (*SKIP) below).
+
+ (*MARK:NAME) or (*:NAME)
+
+- A name is always required with this verb. There may be as many
+- instances of (*MARK) as you like in a pattern, and their names do not
++ A name is always required with this verb. There may be as many
++ instances of (*MARK) as you like in a pattern, and their names do not
+ have to be unique.
+
+- When a match succeeds, the name of the last-encountered (*MARK:NAME),
+- (*PRUNE:NAME), or (*THEN:NAME) on the matching path is passed back to
+- the caller as described in the section entitled "Extra data for
+- pcre_exec()" in the pcreapi documentation. Here is an example of
+- pcretest output, where the /K modifier requests the retrieval and out-
++ When a match succeeds, the name of the last-encountered (*MARK:NAME),
++ (*PRUNE:NAME), or (*THEN:NAME) on the matching path is passed back to
++ the caller as described in the section entitled "Extra data for
++ pcre_exec()" in the pcreapi documentation. Here is an example of
++ pcretest output, where the /K modifier requests the retrieval and out-
+ putting of (*MARK) data:
+
+ re> /X(*MARK:A)Y|X(*MARK:B)Z/K
+@@ -7355,73 +7379,73 @@ BACKTRACKING CONTROL
+ MK: B
+
+ The (*MARK) name is tagged with "MK:" in this output, and in this exam-
+- ple it indicates which of the two alternatives matched. This is a more
+- efficient way of obtaining this information than putting each alterna-
++ ple it indicates which of the two alternatives matched. This is a more
++ efficient way of obtaining this information than putting each alterna-
+ tive in its own capturing parentheses.
+
+- If a verb with a name is encountered in a positive assertion that is
+- true, the name is recorded and passed back if it is the last-encoun-
++ If a verb with a name is encountered in a positive assertion that is
++ true, the name is recorded and passed back if it is the last-encoun-
+ tered. This does not happen for negative assertions or failing positive
+ assertions.
+
+- After a partial match or a failed match, the last encountered name in
++ After a partial match or a failed match, the last encountered name in
+ the entire match process is returned. For example:
+
+ re> /X(*MARK:A)Y|X(*MARK:B)Z/K
+ data> XP
+ No match, mark = B
+
+- Note that in this unanchored example the mark is retained from the
++ Note that in this unanchored example the mark is retained from the
+ match attempt that started at the letter "X" in the subject. Subsequent
+ match attempts starting at "P" and then with an empty string do not get
+ as far as the (*MARK) item, but nevertheless do not reset it.
+
+- If you are interested in (*MARK) values after failed matches, you
+- should probably set the PCRE_NO_START_OPTIMIZE option (see above) to
++ If you are interested in (*MARK) values after failed matches, you
++ should probably set the PCRE_NO_START_OPTIMIZE option (see above) to
+ ensure that the match is always attempted.
+
+ Verbs that act after backtracking
+
+ The following verbs do nothing when they are encountered. Matching con-
+- tinues with what follows, but if there is no subsequent match, causing
+- a backtrack to the verb, a failure is forced. That is, backtracking
+- cannot pass to the left of the verb. However, when one of these verbs
++ tinues with what follows, but if there is no subsequent match, causing
++ a backtrack to the verb, a failure is forced. That is, backtracking
++ cannot pass to the left of the verb. However, when one of these verbs
+ appears inside an atomic group or an assertion that is true, its effect
+- is confined to that group, because once the group has been matched,
+- there is never any backtracking into it. In this situation, backtrack-
+- ing can "jump back" to the left of the entire atomic group or asser-
+- tion. (Remember also, as stated above, that this localization also
++ is confined to that group, because once the group has been matched,
++ there is never any backtracking into it. In this situation, backtrack-
++ ing can "jump back" to the left of the entire atomic group or asser-
++ tion. (Remember also, as stated above, that this localization also
+ applies in subroutine calls.)
+
+- These verbs differ in exactly what kind of failure occurs when back-
+- tracking reaches them. The behaviour described below is what happens
+- when the verb is not in a subroutine or an assertion. Subsequent sec-
++ These verbs differ in exactly what kind of failure occurs when back-
++ tracking reaches them. The behaviour described below is what happens
++ when the verb is not in a subroutine or an assertion. Subsequent sec-
+ tions cover these special cases.
+
+ (*COMMIT)
+
+- This verb, which may not be followed by a name, causes the whole match
++ This verb, which may not be followed by a name, causes the whole match
+ to fail outright if there is a later matching failure that causes back-
+- tracking to reach it. Even if the pattern is unanchored, no further
++ tracking to reach it. Even if the pattern is unanchored, no further
+ attempts to find a match by advancing the starting point take place. If
+- (*COMMIT) is the only backtracking verb that is encountered, once it
++ (*COMMIT) is the only backtracking verb that is encountered, once it
+ has been passed pcre_exec() is committed to finding a match at the cur-
+ rent starting point, or not at all. For example:
+
+ a+(*COMMIT)b
+
+- This matches "xxaab" but not "aacaab". It can be thought of as a kind
++ This matches "xxaab" but not "aacaab". It can be thought of as a kind
+ of dynamic anchor, or "I've started, so I must finish." The name of the
+- most recently passed (*MARK) in the path is passed back when (*COMMIT)
++ most recently passed (*MARK) in the path is passed back when (*COMMIT)
+ forces a match failure.
+
+- If there is more than one backtracking verb in a pattern, a different
+- one that follows (*COMMIT) may be triggered first, so merely passing
++ If there is more than one backtracking verb in a pattern, a different
++ one that follows (*COMMIT) may be triggered first, so merely passing
+ (*COMMIT) during a match does not always guarantee that a match must be
+ at this starting point.
+
+- Note that (*COMMIT) at the start of a pattern is not the same as an
+- anchor, unless PCRE's start-of-match optimizations are turned off, as
++ Note that (*COMMIT) at the start of a pattern is not the same as an
++ anchor, unless PCRE's start-of-match optimizations are turned off, as
+ shown in this output from pcretest:
+
+ re> /(*COMMIT)abc/
+@@ -7432,207 +7456,207 @@ BACKTRACKING CONTROL
+
+ For this pattern, PCRE knows that any match must start with "a", so the
+ optimization skips along the subject to "a" before applying the pattern
+- to the first set of data. The match attempt then succeeds. In the sec-
+- ond set of data, the escape sequence \Y is interpreted by the pcretest
+- program. It causes the PCRE_NO_START_OPTIMIZE option to be set when
++ to the first set of data. The match attempt then succeeds. In the sec-
++ ond set of data, the escape sequence \Y is interpreted by the pcretest
++ program. It causes the PCRE_NO_START_OPTIMIZE option to be set when
+ pcre_exec() is called. This disables the optimization that skips along
+ to the first character. The pattern is now applied starting at "x", and
+- so the (*COMMIT) causes the match to fail without trying any other
++ so the (*COMMIT) causes the match to fail without trying any other
+ starting points.
+
+ (*PRUNE) or (*PRUNE:NAME)
+
+- This verb causes the match to fail at the current starting position in
++ This verb causes the match to fail at the current starting position in
+ the subject if there is a later matching failure that causes backtrack-
+- ing to reach it. If the pattern is unanchored, the normal "bumpalong"
+- advance to the next starting character then happens. Backtracking can
+- occur as usual to the left of (*PRUNE), before it is reached, or when
+- matching to the right of (*PRUNE), but if there is no match to the
+- right, backtracking cannot cross (*PRUNE). In simple cases, the use of
+- (*PRUNE) is just an alternative to an atomic group or possessive quan-
++ ing to reach it. If the pattern is unanchored, the normal "bumpalong"
++ advance to the next starting character then happens. Backtracking can
++ occur as usual to the left of (*PRUNE), before it is reached, or when
++ matching to the right of (*PRUNE), but if there is no match to the
++ right, backtracking cannot cross (*PRUNE). In simple cases, the use of
++ (*PRUNE) is just an alternative to an atomic group or possessive quan-
+ tifier, but there are some uses of (*PRUNE) that cannot be expressed in
+- any other way. In an anchored pattern (*PRUNE) has the same effect as
++ any other way. In an anchored pattern (*PRUNE) has the same effect as
+ (*COMMIT).
+
+ The behaviour of (*PRUNE:NAME) is the not the same as
+- (*MARK:NAME)(*PRUNE). It is like (*MARK:NAME) in that the name is
+- remembered for passing back to the caller. However, (*SKIP:NAME)
++ (*MARK:NAME)(*PRUNE). It is like (*MARK:NAME) in that the name is
++ remembered for passing back to the caller. However, (*SKIP:NAME)
+ searches only for names set with (*MARK).
+
+ (*SKIP)
+
+- This verb, when given without a name, is like (*PRUNE), except that if
+- the pattern is unanchored, the "bumpalong" advance is not to the next
++ This verb, when given without a name, is like (*PRUNE), except that if
++ the pattern is unanchored, the "bumpalong" advance is not to the next
+ character, but to the position in the subject where (*SKIP) was encoun-
+- tered. (*SKIP) signifies that whatever text was matched leading up to
++ tered. (*SKIP) signifies that whatever text was matched leading up to
+ it cannot be part of a successful match. Consider:
+
+ a+(*SKIP)b
+
+- If the subject is "aaaac...", after the first match attempt fails
+- (starting at the first character in the string), the starting point
++ If the subject is "aaaac...", after the first match attempt fails
++ (starting at the first character in the string), the starting point
+ skips on to start the next attempt at "c". Note that a possessive quan-
+- tifer does not have the same effect as this example; although it would
+- suppress backtracking during the first match attempt, the second
+- attempt would start at the second character instead of skipping on to
++ tifier does not have the same effect as this example; although it would
++ suppress backtracking during the first match attempt, the second
++ attempt would start at the second character instead of skipping on to
+ "c".
+
+ (*SKIP:NAME)
+
+ When (*SKIP) has an associated name, its behaviour is modified. When it
+ is triggered, the previous path through the pattern is searched for the
+- most recent (*MARK) that has the same name. If one is found, the
++ most recent (*MARK) that has the same name. If one is found, the
+ "bumpalong" advance is to the subject position that corresponds to that
+ (*MARK) instead of to where (*SKIP) was encountered. If no (*MARK) with
+ a matching name is found, the (*SKIP) is ignored.
+
+- Note that (*SKIP:NAME) searches only for names set by (*MARK:NAME). It
++ Note that (*SKIP:NAME) searches only for names set by (*MARK:NAME). It
+ ignores names that are set by (*PRUNE:NAME) or (*THEN:NAME).
+
+ (*THEN) or (*THEN:NAME)
+
+- This verb causes a skip to the next innermost alternative when back-
+- tracking reaches it. That is, it cancels any further backtracking
+- within the current alternative. Its name comes from the observation
++ This verb causes a skip to the next innermost alternative when back-
++ tracking reaches it. That is, it cancels any further backtracking
++ within the current alternative. Its name comes from the observation
+ that it can be used for a pattern-based if-then-else block:
+
+ ( COND1 (*THEN) FOO | COND2 (*THEN) BAR | COND3 (*THEN) BAZ ) ...
+
+- If the COND1 pattern matches, FOO is tried (and possibly further items
+- after the end of the group if FOO succeeds); on failure, the matcher
+- skips to the second alternative and tries COND2, without backtracking
+- into COND1. If that succeeds and BAR fails, COND3 is tried. If subse-
+- quently BAZ fails, there are no more alternatives, so there is a back-
+- track to whatever came before the entire group. If (*THEN) is not
++ If the COND1 pattern matches, FOO is tried (and possibly further items
++ after the end of the group if FOO succeeds); on failure, the matcher
++ skips to the second alternative and tries COND2, without backtracking
++ into COND1. If that succeeds and BAR fails, COND3 is tried. If subse-
++ quently BAZ fails, there are no more alternatives, so there is a back-
++ track to whatever came before the entire group. If (*THEN) is not
+ inside an alternation, it acts like (*PRUNE).
+
+- The behaviour of (*THEN:NAME) is the not the same as
+- (*MARK:NAME)(*THEN). It is like (*MARK:NAME) in that the name is
+- remembered for passing back to the caller. However, (*SKIP:NAME)
++ The behaviour of (*THEN:NAME) is not the same as
++ (*MARK:NAME)(*THEN). It is like (*MARK:NAME) in that the name is
++ remembered for passing back to the caller. However, (*SKIP:NAME)
+ searches only for names set with (*MARK).
+
+- A subpattern that does not contain a | character is just a part of the
+- enclosing alternative; it is not a nested alternation with only one
+- alternative. The effect of (*THEN) extends beyond such a subpattern to
+- the enclosing alternative. Consider this pattern, where A, B, etc. are
+- complex pattern fragments that do not contain any | characters at this
++ A subpattern that does not contain a | character is just a part of the
++ enclosing alternative; it is not a nested alternation with only one
++ alternative. The effect of (*THEN) extends beyond such a subpattern to
++ the enclosing alternative. Consider this pattern, where A, B, etc. are
++ complex pattern fragments that do not contain any | characters at this
+ level:
+
+ A (B(*THEN)C) | D
+
+- If A and B are matched, but there is a failure in C, matching does not
++ If A and B are matched, but there is a failure in C, matching does not
+ backtrack into A; instead it moves to the next alternative, that is, D.
+- However, if the subpattern containing (*THEN) is given an alternative,
++ However, if the subpattern containing (*THEN) is given an alternative,
+ it behaves differently:
+
+ A (B(*THEN)C | (*FAIL)) | D
+
+- The effect of (*THEN) is now confined to the inner subpattern. After a
++ The effect of (*THEN) is now confined to the inner subpattern. After a
+ failure in C, matching moves to (*FAIL), which causes the whole subpat-
+- tern to fail because there are no more alternatives to try. In this
++ tern to fail because there are no more alternatives to try. In this
+ case, matching does now backtrack into A.
+
+- Note that a conditional subpattern is not considered as having two
+- alternatives, because only one is ever used. In other words, the |
++ Note that a conditional subpattern is not considered as having two
++ alternatives, because only one is ever used. In other words, the |
+ character in a conditional subpattern has a different meaning. Ignoring
+ white space, consider:
+
+ ^.*? (?(?=a) a | b(*THEN)c )
+
+- If the subject is "ba", this pattern does not match. Because .*? is
+- ungreedy, it initially matches zero characters. The condition (?=a)
+- then fails, the character "b" is matched, but "c" is not. At this
+- point, matching does not backtrack to .*? as might perhaps be expected
+- from the presence of the | character. The conditional subpattern is
++ If the subject is "ba", this pattern does not match. Because .*? is
++ ungreedy, it initially matches zero characters. The condition (?=a)
++ then fails, the character "b" is matched, but "c" is not. At this
++ point, matching does not backtrack to .*? as might perhaps be expected
++ from the presence of the | character. The conditional subpattern is
+ part of the single alternative that comprises the whole pattern, and so
+- the match fails. (If there was a backtrack into .*?, allowing it to
++ the match fails. (If there was a backtrack into .*?, allowing it to
+ match "b", the match would succeed.)
+
+- The verbs just described provide four different "strengths" of control
++ The verbs just described provide four different "strengths" of control
+ when subsequent matching fails. (*THEN) is the weakest, carrying on the
+- match at the next alternative. (*PRUNE) comes next, failing the match
+- at the current starting position, but allowing an advance to the next
+- character (for an unanchored pattern). (*SKIP) is similar, except that
++ match at the next alternative. (*PRUNE) comes next, failing the match
++ at the current starting position, but allowing an advance to the next
++ character (for an unanchored pattern). (*SKIP) is similar, except that
+ the advance may be more than one character. (*COMMIT) is the strongest,
+ causing the entire match to fail.
+
+ More than one backtracking verb
+
+- If more than one backtracking verb is present in a pattern, the one
+- that is backtracked onto first acts. For example, consider this pat-
++ If more than one backtracking verb is present in a pattern, the one
++ that is backtracked onto first acts. For example, consider this pat-
+ tern, where A, B, etc. are complex pattern fragments:
+
+ (A(*COMMIT)B(*THEN)C|ABD)
+
+- If A matches but B fails, the backtrack to (*COMMIT) causes the entire
++ If A matches but B fails, the backtrack to (*COMMIT) causes the entire
+ match to fail. However, if A and B match, but C fails, the backtrack to
+- (*THEN) causes the next alternative (ABD) to be tried. This behaviour
+- is consistent, but is not always the same as Perl's. It means that if
+- two or more backtracking verbs appear in succession, all the the last
++ (*THEN) causes the next alternative (ABD) to be tried. This behaviour
++ is consistent, but is not always the same as Perl's. It means that if
++ two or more backtracking verbs appear in succession, all but the last
+ of them has no effect. Consider this example:
+
+ ...(*COMMIT)(*PRUNE)...
+
+ If there is a matching failure to the right, backtracking onto (*PRUNE)
+- causes it to be triggered, and its action is taken. There can never be
++ causes it to be triggered, and its action is taken. There can never be
+ a backtrack onto (*COMMIT).
+
+ Backtracking verbs in repeated groups
+
+- PCRE differs from Perl in its handling of backtracking verbs in
++ PCRE differs from Perl in its handling of backtracking verbs in
+ repeated groups. For example, consider:
+
+ /(a(*COMMIT)b)+ac/
+
+- If the subject is "abac", Perl matches, but PCRE fails because the
++ If the subject is "abac", Perl matches, but PCRE fails because the
+ (*COMMIT) in the second repeat of the group acts.
+
+ Backtracking verbs in assertions
+
+- (*FAIL) in an assertion has its normal effect: it forces an immediate
++ (*FAIL) in an assertion has its normal effect: it forces an immediate
+ backtrack.
+
+ (*ACCEPT) in a positive assertion causes the assertion to succeed with-
+- out any further processing. In a negative assertion, (*ACCEPT) causes
++ out any further processing. In a negative assertion, (*ACCEPT) causes
+ the assertion to fail without any further processing.
+
+- The other backtracking verbs are not treated specially if they appear
+- in a positive assertion. In particular, (*THEN) skips to the next
+- alternative in the innermost enclosing group that has alternations,
++ The other backtracking verbs are not treated specially if they appear
++ in a positive assertion. In particular, (*THEN) skips to the next
++ alternative in the innermost enclosing group that has alternations,
+ whether or not this is within the assertion.
+
+- Negative assertions are, however, different, in order to ensure that
+- changing a positive assertion into a negative assertion changes its
++ Negative assertions are, however, different, in order to ensure that
++ changing a positive assertion into a negative assertion changes its
+ result. Backtracking into (*COMMIT), (*SKIP), or (*PRUNE) causes a neg-
+ ative assertion to be true, without considering any further alternative
+ branches in the assertion. Backtracking into (*THEN) causes it to skip
+- to the next enclosing alternative within the assertion (the normal be-
+- haviour), but if the assertion does not have such an alternative,
++ to the next enclosing alternative within the assertion (the normal be-
++ haviour), but if the assertion does not have such an alternative,
+ (*THEN) behaves like (*PRUNE).
+
+ Backtracking verbs in subroutines
+
+- These behaviours occur whether or not the subpattern is called recur-
++ These behaviours occur whether or not the subpattern is called recur-
+ sively. Perl's treatment of subroutines is different in some cases.
+
+- (*FAIL) in a subpattern called as a subroutine has its normal effect:
++ (*FAIL) in a subpattern called as a subroutine has its normal effect:
+ it forces an immediate backtrack.
+
+- (*ACCEPT) in a subpattern called as a subroutine causes the subroutine
+- match to succeed without any further processing. Matching then contin-
++ (*ACCEPT) in a subpattern called as a subroutine causes the subroutine
++ match to succeed without any further processing. Matching then contin-
+ ues after the subroutine call.
+
+ (*COMMIT), (*SKIP), and (*PRUNE) in a subpattern called as a subroutine
+ cause the subroutine match to fail.
+
+- (*THEN) skips to the next alternative in the innermost enclosing group
+- within the subpattern that has alternatives. If there is no such group
++ (*THEN) skips to the next alternative in the innermost enclosing group
++ within the subpattern that has alternatives. If there is no such group
+ within the subpattern, (*THEN) causes the subroutine match to fail.
+
+
+ SEE ALSO
+
+- pcreapi(3), pcrecallout(3), pcrematching(3), pcresyntax(3), pcre(3),
++ pcreapi(3), pcrecallout(3), pcrematching(3), pcresyntax(3), pcre(3),
+ pcre16(3), pcre32(3).
+
+
+@@ -7645,8 +7669,8 @@ AUTHOR
+
+ REVISION
+
+- Last updated: 08 January 2014
+- Copyright (c) 1997-2014 University of Cambridge.
++ Last updated: 14 June 2015
++ Copyright (c) 1997-2015 University of Cambridge.
+ ------------------------------------------------------------------------------
+
+
+diff --git a/ext/pcre/pcrelib/pcre.h b/ext/pcre/pcrelib/pcre.h
+index 58ed46a..bf6351f 100644
+--- a/ext/pcre/pcrelib/pcre.h
++++ b/ext/pcre/pcrelib/pcre.h
+@@ -42,9 +42,9 @@ POSSIBILITY OF SUCH DAMAGE.
+ /* The current PCRE version information. */
+
+ #define PCRE_MAJOR 8
+-#define PCRE_MINOR 37
++#define PCRE_MINOR 38
+ #define PCRE_PRERELEASE
+-#define PCRE_DATE 2015-04-28
++#define PCRE_DATE 2015-11-23
+
+ /* When an application links to a PCRE DLL in Windows, the symbols that are
+ imported have to be identified as such. When building PCRE, the appropriate
+diff --git a/ext/pcre/pcrelib/pcre_compile.c b/ext/pcre/pcrelib/pcre_compile.c
+index 0efad26..4d3b313 100644
+--- a/ext/pcre/pcrelib/pcre_compile.c
++++ b/ext/pcre/pcrelib/pcre_compile.c
+@@ -174,7 +174,7 @@ static const short int escapes[] = {
+ -ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
+ CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
+ CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
+- CHAR_GRAVE_ACCENT, 7,
++ CHAR_GRAVE_ACCENT, ESC_a,
+ -ESC_b, 0,
+ -ESC_d, ESC_e,
+ ESC_f, 0,
+@@ -202,9 +202,9 @@ static const short int escapes[] = {
+ /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
+ /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
+-/* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
++/* 80 */ 0, ESC_a, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
+ /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
+-/* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
++/* 90 */ 0, 0, -ESC_k, 0, 0, ESC_n, 0, -ESC_p,
+ /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
+ /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
+ /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
+@@ -219,6 +219,12 @@ static const short int escapes[] = {
+ /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
+ /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
+ };
++
++/* We also need a table of characters that may follow \c in an EBCDIC
++environment for characters 0-31. */
++
++static unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
++
+ #endif
+
+
+@@ -458,7 +464,7 @@ static const char error_texts[] =
+ "range out of order in character class\0"
+ "nothing to repeat\0"
+ /* 10 */
+- "operand of unlimited repeat could match the empty string\0" /** DEAD **/
++ "internal error: invalid forward reference offset\0"
+ "internal error: unexpected repeat\0"
+ "unrecognized character after (? or (?-\0"
+ "POSIX named classes are supported only within a class\0"
+@@ -527,7 +533,11 @@ static const char error_texts[] =
+ "different names for subpatterns of the same number are not allowed\0"
+ "(*MARK) must have an argument\0"
+ "this version of PCRE is not compiled with Unicode property support\0"
++#ifndef EBCDIC
+ "\\c must be followed by an ASCII character\0"
++#else
++ "\\c must be followed by a letter or one of [\\]^_?\0"
++#endif
+ "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
+ /* 70 */
+ "internal error: unknown opcode in find_fixedlength()\0"
+@@ -1425,7 +1435,16 @@ else
+ c ^= 0x40;
+ #else /* EBCDIC coding */
+ if (c >= CHAR_a && c <= CHAR_z) c += 64;
+- c ^= 0xC0;
++ if (c == CHAR_QUESTION_MARK)
++ c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
++ else
++ {
++ for (i = 0; i < 32; i++)
++ {
++ if (c == ebcdic_escape_c[i]) break;
++ }
++ if (i < 32) c = i; else *errorcodeptr = ERR68;
++ }
+ #endif
+ break;
+
+@@ -1799,7 +1818,7 @@ for (;;)
+ case OP_ASSERTBACK:
+ case OP_ASSERTBACK_NOT:
+ do cc += GET(cc, 1); while (*cc == OP_ALT);
+- cc += PRIV(OP_lengths)[*cc];
++ cc += 1 + LINK_SIZE;
+ break;
+
+ /* Skip over things that don't match chars */
+@@ -2487,7 +2506,7 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
+ if (c == OP_BRA || c == OP_BRAPOS ||
+ c == OP_CBRA || c == OP_CBRAPOS ||
+ c == OP_ONCE || c == OP_ONCE_NC ||
+- c == OP_COND)
++ c == OP_COND || c == OP_SCOND)
+ {
+ BOOL empty_branch;
+ if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
+@@ -3886,11 +3905,11 @@ didn't consider this to be a POSIX class. Likewise for [:1234:].
+ The problem in trying to be exactly like Perl is in the handling of escapes. We
+ have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
+ class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
+-below handles the special case of \], but does not try to do any other escape
+-processing. This makes it different from Perl for cases such as [:l\ower:]
+-where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
+-"l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
+-I think.
++below handles the special cases \\ and \], but does not try to do any other
++escape processing. This makes it different from Perl for cases such as
++[:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
++not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
++when Perl does, I think.
+
+ A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
+ It seems that the appearance of a nested POSIX class supersedes an apparent
+@@ -3917,21 +3936,16 @@ pcre_uchar terminator; /* Don't combine these lines; the Solaris cc */
+ terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
+ for (++ptr; *ptr != CHAR_NULL; ptr++)
+ {
+- if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
++ if (*ptr == CHAR_BACKSLASH &&
++ (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET ||
++ ptr[1] == CHAR_BACKSLASH))
+ ptr++;
+- else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
+- else
++ else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
++ *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
++ else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
+ {
+- if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
+- {
+- *endptr = ptr;
+- return TRUE;
+- }
+- if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&
+- (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
+- ptr[1] == CHAR_EQUALS_SIGN) &&
+- check_posix_syntax(ptr, endptr))
+- return FALSE;
++ *endptr = ptr;
++ return TRUE;
+ }
+ }
+ return FALSE;
+@@ -3985,11 +3999,12 @@ have their offsets adjusted. That one of the jobs of this function. Before it
+ is called, the partially compiled regex must be temporarily terminated with
+ OP_END.
+
+-This function has been extended with the possibility of forward references for
+-recursions and subroutine calls. It must also check the list of such references
+-for the group we are dealing with. If it finds that one of the recursions in
+-the current group is on this list, it adjusts the offset in the list, not the
+-value in the reference (which is a group number).
++This function has been extended to cope with forward references for recursions
++and subroutine calls. It must check the list of such references for the
++group we are dealing with. If it finds that one of the recursions in the
++current group is on this list, it does not adjust the value in the reference
++(which is a group number). After the group has been scanned, all the offsets in
++the forward reference list for the group are adjusted.
+
+ Arguments:
+ group points to the start of the group
+@@ -4005,29 +4020,21 @@ static void
+ adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
+ size_t save_hwm_offset)
+ {
++int offset;
++pcre_uchar *hc;
+ pcre_uchar *ptr = group;
+
+ while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
+ {
+- int offset;
+- pcre_uchar *hc;
+-
+- /* See if this recursion is on the forward reference list. If so, adjust the
+- reference. */
+-
+ for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset; hc < cd->hwm;
+ hc += LINK_SIZE)
+ {
+ offset = (int)GET(hc, 0);
+- if (cd->start_code + offset == ptr + 1)
+- {
+- PUT(hc, 0, offset + adjust);
+- break;
+- }
++ if (cd->start_code + offset == ptr + 1) break;
+ }
+
+- /* Otherwise, adjust the recursion offset if it's after the start of this
+- group. */
++ /* If we have not found this recursion on the forward reference list, adjust
++ the recursion's offset if it's after the start of this group. */
+
+ if (hc >= cd->hwm)
+ {
+@@ -4037,6 +4044,15 @@ while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
+
+ ptr += 1 + LINK_SIZE;
+ }
++
++/* Now adjust all forward reference offsets for the group. */
++
++for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset; hc < cd->hwm;
++ hc += LINK_SIZE)
++ {
++ offset = (int)GET(hc, 0);
++ PUT(hc, 0, offset + adjust);
++ }
+ }
+
+
+@@ -4465,7 +4481,7 @@ const pcre_uchar *tempptr;
+ const pcre_uchar *nestptr = NULL;
+ pcre_uchar *previous = NULL;
+ pcre_uchar *previous_callout = NULL;
+-size_t save_hwm_offset = 0;
++size_t item_hwm_offset = 0;
+ pcre_uint8 classbits[32];
+
+ /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
+@@ -4623,8 +4639,7 @@ for (;; ptr++)
+ /* In the real compile phase, just check the workspace used by the forward
+ reference list. */
+
+- else if (cd->hwm > cd->start_workspace + cd->workspace_size -
+- WORK_SIZE_SAFETY_MARGIN)
++ else if (cd->hwm > cd->start_workspace + cd->workspace_size)
+ {
+ *errorcodeptr = ERR52;
+ goto FAILED;
+@@ -4767,6 +4782,7 @@ for (;; ptr++)
+ zeroreqchar = reqchar;
+ zeroreqcharflags = reqcharflags;
+ previous = code;
++ item_hwm_offset = cd->hwm - cd->start_workspace;
+ *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
+ break;
+
+@@ -4818,6 +4834,7 @@ for (;; ptr++)
+ /* Handle a real character class. */
+
+ previous = code;
++ item_hwm_offset = cd->hwm - cd->start_workspace;
+
+ /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
+ they are encountered at the top level, so we'll do that too. */
+@@ -4923,9 +4940,10 @@ for (;; ptr++)
+ (which is on the stack). We have to remember that there was XCLASS data,
+ however. */
+
++ if (class_uchardata > class_uchardata_base) xclass = TRUE;
++
+ if (lengthptr != NULL && class_uchardata > class_uchardata_base)
+ {
+- xclass = TRUE;
+ *lengthptr += (int)(class_uchardata - class_uchardata_base);
+ class_uchardata = class_uchardata_base;
+ }
+@@ -5028,10 +5046,26 @@ for (;; ptr++)
+ ptr = tempptr + 1;
+ continue;
+
+- /* For all other POSIX classes, no special action is taken in UCP
+- mode. Fall through to the non_UCP case. */
++ /* For the other POSIX classes (ascii, xdigit) we are going to fall
++ through to the non-UCP case and build a bit map for characters with
++ code points less than 256. If we are in a negated POSIX class
++ within a non-negated overall class, characters with code points
++ greater than 255 must all match. In the special case where we have
++ not yet generated any xclass data, and this is the final item in
++ the overall class, we need do nothing: later on, the opcode
++ OP_NCLASS will be used to indicate that characters greater than 255
++ are acceptable. If we have already seen an xclass item or one may
++ follow (we have to assume that it might if this is not the end of
++ the class), explicitly match all wide codepoints. */
+
+ default:
++ if (!negate_class && local_negate &&
++ (xclass || tempptr[2] != CHAR_RIGHT_SQUARE_BRACKET))
++ {
++ *class_uchardata++ = XCL_RANGE;
++ class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
++ class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
++ }
+ break;
+ }
+ }
+@@ -5195,9 +5229,9 @@ for (;; ptr++)
+ cd, PRIV(vspace_list));
+ continue;
+
+-#ifdef SUPPORT_UCP
+ case ESC_p:
+ case ESC_P:
++#ifdef SUPPORT_UCP
+ {
+ BOOL negated;
+ unsigned int ptype = 0, pdata = 0;
+@@ -5211,6 +5245,9 @@ for (;; ptr++)
+ class_has_8bitchar--; /* Undo! */
+ continue;
+ }
++#else
++ *errorcodeptr = ERR45;
++ goto FAILED;
+ #endif
+ /* Unrecognized escapes are faulted if PCRE is running in its
+ strict mode. By default, for compatibility with Perl, they are
+@@ -5367,16 +5404,20 @@ for (;; ptr++)
+ CLASS_SINGLE_CHARACTER:
+ if (class_one_char < 2) class_one_char++;
+
+- /* If class_one_char is 1, we have the first single character in the
+- class, and there have been no prior ranges, or XCLASS items generated by
+- escapes. If this is the final character in the class, we can optimize by
+- turning the item into a 1-character OP_CHAR[I] if it's positive, or
+- OP_NOT[I] if it's negative. In the positive case, it can cause firstchar
+- to be set. Otherwise, there can be no first char if this item is first,
+- whatever repeat count may follow. In the case of reqchar, save the
+- previous value for reinstating. */
++ /* If xclass_has_prop is false and class_one_char is 1, we have the first
++ single character in the class, and there have been no prior ranges, or
++ XCLASS items generated by escapes. If this is the final character in the
++ class, we can optimize by turning the item into a 1-character OP_CHAR[I]
++ if it's positive, or OP_NOT[I] if it's negative. In the positive case, it
++ can cause firstchar to be set. Otherwise, there can be no first char if
++ this item is first, whatever repeat count may follow. In the case of
++ reqchar, save the previous value for reinstating. */
+
+- if (!inescq && class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
++ if (!inescq &&
++#ifdef SUPPORT_UCP
++ !xclass_has_prop &&
++#endif
++ class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
+ {
+ ptr++;
+ zeroreqchar = reqchar;
+@@ -5492,9 +5533,10 @@ for (;; ptr++)
+ actual compiled code. */
+
+ #ifdef SUPPORT_UTF
+- if (xclass && (!should_flip_negation || (options & PCRE_UCP) != 0))
++ if (xclass && (xclass_has_prop || !should_flip_negation ||
++ (options & PCRE_UCP) != 0))
+ #elif !defined COMPILE_PCRE8
+- if (xclass && !should_flip_negation)
++ if (xclass && (xclass_has_prop || !should_flip_negation))
+ #endif
+ #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
+ {
+@@ -5930,7 +5972,7 @@ for (;; ptr++)
+ {
+ register int i;
+ int len = (int)(code - previous);
+- size_t base_hwm_offset = save_hwm_offset;
++ size_t base_hwm_offset = item_hwm_offset;
+ pcre_uchar *bralink = NULL;
+ pcre_uchar *brazeroptr = NULL;
+
+@@ -5985,7 +6027,7 @@ for (;; ptr++)
+ if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
+ {
+ *code = OP_END;
+- adjust_recurse(previous, 1, utf, cd, save_hwm_offset);
++ adjust_recurse(previous, 1, utf, cd, item_hwm_offset);
+ memmove(previous + 1, previous, IN_UCHARS(len));
+ code++;
+ if (repeat_max == 0)
+@@ -6009,7 +6051,7 @@ for (;; ptr++)
+ {
+ int offset;
+ *code = OP_END;
+- adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, save_hwm_offset);
++ adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, item_hwm_offset);
+ memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len));
+ code += 2 + LINK_SIZE;
+ *previous++ = OP_BRAZERO + repeat_type;
+@@ -6254,6 +6296,12 @@ for (;; ptr++)
+ while (*scode == OP_ALT);
+ }
+
++ /* A conditional group with only one branch has an implicit empty
++ alternative branch. */
++
++ if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
++ *bracode = OP_SCOND;
++
+ /* Handle possessive quantifiers. */
+
+ if (possessive_quantifier)
+@@ -6267,11 +6315,11 @@ for (;; ptr++)
+ {
+ int nlen = (int)(code - bracode);
+ *code = OP_END;
+- adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, save_hwm_offset);
++ adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
+ memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen));
+ code += 1 + LINK_SIZE;
+ nlen += 1 + LINK_SIZE;
+- *bracode = OP_BRAPOS;
++ *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
+ *code++ = OP_KETRPOS;
+ PUTINC(code, 0, nlen);
+ PUT(bracode, 1, nlen);
+@@ -6401,7 +6449,7 @@ for (;; ptr++)
+ else
+ {
+ *code = OP_END;
+- adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm_offset);
++ adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
+ memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
+ code += 1 + LINK_SIZE;
+ len += 1 + LINK_SIZE;
+@@ -6450,7 +6498,7 @@ for (;; ptr++)
+
+ default:
+ *code = OP_END;
+- adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm_offset);
++ adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
+ memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
+ code += 1 + LINK_SIZE;
+ len += 1 + LINK_SIZE;
+@@ -6586,9 +6634,17 @@ for (;; ptr++)
+ goto FAILED;
+ }
+ setverb = *code++ = verbs[i].op_arg;
+- *code++ = arglen;
+- memcpy(code, arg, IN_UCHARS(arglen));
+- code += arglen;
++ if (lengthptr != NULL) /* In pass 1 just add in the length */
++ { /* to avoid potential workspace */
++ *lengthptr += arglen; /* overflow. */
++ *code++ = 0;
++ }
++ else
++ {
++ *code++ = arglen;
++ memcpy(code, arg, IN_UCHARS(arglen));
++ code += arglen;
++ }
+ *code++ = 0;
+ }
+
+@@ -6623,7 +6679,7 @@ for (;; ptr++)
+ newoptions = options;
+ skipbytes = 0;
+ bravalue = OP_CBRA;
+- save_hwm_offset = cd->hwm - cd->start_workspace;
++ item_hwm_offset = cd->hwm - cd->start_workspace;
+ reset_bracount = FALSE;
+
+ /* Deal with the extended parentheses; all are introduced by '?', and the
+@@ -6641,6 +6697,7 @@ for (;; ptr++)
+ /* ------------------------------------------------------------ */
+ case CHAR_VERTICAL_LINE: /* Reset capture count for each branch */
+ reset_bracount = TRUE;
++ cd->dupgroups = TRUE; /* Record (?| encountered */
+ /* Fall through */
+
+ /* ------------------------------------------------------------ */
+@@ -6741,6 +6798,12 @@ for (;; ptr++)
+ {
+ while (IS_DIGIT(*ptr))
+ {
++ if (recno > INT_MAX / 10 - 1) /* Integer overflow */
++ {
++ while (IS_DIGIT(*ptr)) ptr++;
++ *errorcodeptr = ERR61;
++ goto FAILED;
++ }
+ recno = recno * 10 + (int)(*ptr - CHAR_0);
+ ptr++;
+ }
+@@ -6769,7 +6832,7 @@ for (;; ptr++)
+ ptr++;
+ }
+ namelen = (int)(ptr - name);
+- if (lengthptr != NULL) *lengthptr += IMM2_SIZE;
++ if (lengthptr != NULL) skipbytes += IMM2_SIZE;
+ }
+
+ /* Check the terminator */
+@@ -6875,6 +6938,11 @@ for (;; ptr++)
+ *errorcodeptr = ERR15;
+ goto FAILED;
+ }
++ if (recno > INT_MAX / 10 - 1) /* Integer overflow */
++ {
++ *errorcodeptr = ERR61;
++ goto FAILED;
++ }
+ recno = recno * 10 + name[i] - CHAR_0;
+ }
+ if (recno == 0) recno = RREF_ANY;
+@@ -7151,6 +7219,7 @@ for (;; ptr++)
+ if (lengthptr != NULL)
+ {
+ named_group *ng;
++ recno = 0;
+
+ if (namelen == 0)
+ {
+@@ -7168,20 +7237,6 @@ for (;; ptr++)
+ goto FAILED;
+ }
+
+- /* The name table does not exist in the first pass; instead we must
+- scan the list of names encountered so far in order to get the
+- number. If the name is not found, set the value to 0 for a forward
+- reference. */
+-
+- ng = cd->named_groups;
+- for (i = 0; i < cd->names_found; i++, ng++)
+- {
+- if (namelen == ng->length &&
+- STRNCMP_UC_UC(name, ng->name, namelen) == 0)
+- break;
+- }
+- recno = (i < cd->names_found)? ng->number : 0;
+-
+ /* Count named back references. */
+
+ if (!is_recurse) cd->namedrefcount++;
+@@ -7191,6 +7246,56 @@ for (;; ptr++)
+ 16-bit data item. */
+
+ *lengthptr += IMM2_SIZE;
++
++ /* If this is a forward reference and we are within a (?|...) group,
++ the reference may end up as the number of a group which we are
++ currently inside, that is, it could be a recursive reference. In the
++ real compile this will be picked up and the reference wrapped with
++ OP_ONCE to make it atomic, so we must allow space in case this occurs. */
++
++ /* In fact, this can happen for a non-forward reference because
++ another group with the same number might be created later. This
++ issue is fixed "properly" in PCRE2. As PCRE1 is now in maintenance
++ only mode, we finesse the bug by allowing more memory always. */
++
++ *lengthptr += 2 + 2*LINK_SIZE;
++
++ /* It is even worse than that. The current reference may be to an
++ existing named group with a different number (so apparently not
++ recursive) but which later on is also attached to a group with the
++ current number. This can only happen if (?| has been previously
++ encountered. In that case, we allow yet more memory, just in case.
++ (Again, this is fixed "properly" in PCRE2.) */
++
++ if (cd->dupgroups) *lengthptr += 4 + 4*LINK_SIZE;
++
++ /* Otherwise, check for recursion here. The name table does not exist
++ in the first pass; instead we must scan the list of names encountered
++ so far in order to get the number. If the name is not found, leave
++ the value of recno as 0 for a forward reference. */
++
++ else
++ {
++ ng = cd->named_groups;
++ for (i = 0; i < cd->names_found; i++, ng++)
++ {
++ if (namelen == ng->length &&
++ STRNCMP_UC_UC(name, ng->name, namelen) == 0)
++ {
++ open_capitem *oc;
++ recno = ng->number;
++ if (is_recurse) break;
++ for (oc = cd->open_caps; oc != NULL; oc = oc->next)
++ {
++ if (oc->number == recno)
++ {
++ oc->flag = TRUE;
++ break;
++ }
++ }
++ }
++ }
++ }
+ }
+
+ /* In the real compile, search the name table. We check the name
+@@ -7237,8 +7342,6 @@ for (;; ptr++)
+ for (i++; i < cd->names_found; i++)
+ {
+ if (STRCMP_UC_UC(slot + IMM2_SIZE, cslot + IMM2_SIZE) != 0) break;
+-
+-
+ count++;
+ cslot += cd->name_entry_size;
+ }
+@@ -7247,6 +7350,7 @@ for (;; ptr++)
+ {
+ if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
+ previous = code;
++ item_hwm_offset = cd->hwm - cd->start_workspace;
+ *code++ = ((options & PCRE_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
+ PUT2INC(code, 0, index);
+ PUT2INC(code, 0, count);
+@@ -7284,9 +7388,14 @@ for (;; ptr++)
+
+
+ /* ------------------------------------------------------------ */
+- case CHAR_R: /* Recursion */
+- ptr++; /* Same as (?0) */
+- /* Fall through */
++ case CHAR_R: /* Recursion, same as (?0) */
++ recno = 0;
++ if (*(++ptr) != CHAR_RIGHT_PARENTHESIS)
++ {
++ *errorcodeptr = ERR29;
++ goto FAILED;
++ }
++ goto HANDLE_RECURSION;
+
+
+ /* ------------------------------------------------------------ */
+@@ -7323,7 +7432,15 @@ for (;; ptr++)
+
+ recno = 0;
+ while(IS_DIGIT(*ptr))
++ {
++ if (recno > INT_MAX / 10 - 1) /* Integer overflow */
++ {
++ while (IS_DIGIT(*ptr)) ptr++;
++ *errorcodeptr = ERR61;
++ goto FAILED;
++ }
+ recno = recno * 10 + *ptr++ - CHAR_0;
++ }
+
+ if (*ptr != (pcre_uchar)terminator)
+ {
+@@ -7360,6 +7477,7 @@ for (;; ptr++)
+ HANDLE_RECURSION:
+
+ previous = code;
++ item_hwm_offset = cd->hwm - cd->start_workspace;
+ called = cd->start_code;
+
+ /* When we are actually compiling, find the bracket that is being
+@@ -7561,7 +7679,11 @@ for (;; ptr++)
+ previous = NULL;
+ cd->iscondassert = FALSE;
+ }
+- else previous = code;
++ else
++ {
++ previous = code;
++ item_hwm_offset = cd->hwm - cd->start_workspace;
++ }
+
+ *code = bravalue;
+ tempcode = code;
+@@ -7809,7 +7931,7 @@ for (;; ptr++)
+ const pcre_uchar *p;
+ pcre_uint32 cf;
+
+- save_hwm_offset = cd->hwm - cd->start_workspace; /* Normally this is set when '(' is read */
++ item_hwm_offset = cd->hwm - cd->start_workspace; /* Normally this is set when '(' is read */
+ terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
+ CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
+
+@@ -7838,7 +7960,7 @@ for (;; ptr++)
+ if (*p != (pcre_uchar)terminator)
+ {
+ *errorcodeptr = ERR57;
+- break;
++ goto FAILED;
+ }
+ ptr++;
+ goto HANDLE_NUMERICAL_RECURSION;
+@@ -7853,7 +7975,7 @@ for (;; ptr++)
+ ptr[1] != CHAR_APOSTROPHE && ptr[1] != CHAR_LEFT_CURLY_BRACKET))
+ {
+ *errorcodeptr = ERR69;
+- break;
++ goto FAILED;
+ }
+ is_recurse = FALSE;
+ terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
+@@ -7877,6 +7999,7 @@ for (;; ptr++)
+ HANDLE_REFERENCE:
+ if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
+ previous = code;
++ item_hwm_offset = cd->hwm - cd->start_workspace;
+ *code++ = ((options & PCRE_CASELESS) != 0)? OP_REFI : OP_REF;
+ PUT2INC(code, 0, recno);
+ cd->backref_map |= (recno < 32)? (1 << recno) : 1;
+@@ -7906,6 +8029,7 @@ for (;; ptr++)
+ if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
+ goto FAILED;
+ previous = code;
++ item_hwm_offset = cd->hwm - cd->start_workspace;
+ *code++ = ((escape == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
+ *code++ = ptype;
+ *code++ = pdata;
+@@ -7946,6 +8070,7 @@ for (;; ptr++)
+
+ {
+ previous = (escape > ESC_b && escape < ESC_Z)? code : NULL;
++ item_hwm_offset = cd->hwm - cd->start_workspace;
+ *code++ = (!utf && escape == ESC_C)? OP_ALLANY : escape;
+ }
+ }
+@@ -7989,6 +8114,7 @@ for (;; ptr++)
+
+ ONE_CHAR:
+ previous = code;
++ item_hwm_offset = cd->hwm - cd->start_workspace;
+
+ /* For caseless UTF-8 mode when UCP support is available, check whether
+ this character has more than one other case. If so, generate a special
+@@ -9164,6 +9290,7 @@ cd->names_found = 0;
+ cd->name_entry_size = 0;
+ cd->name_table = NULL;
+ cd->dupnames = FALSE;
++cd->dupgroups = FALSE;
+ cd->namedrefcount = 0;
+ cd->start_code = cworkspace;
+ cd->hwm = cworkspace;
+@@ -9336,6 +9463,16 @@ if (cd->hwm > cd->start_workspace)
+ int offset, recno;
+ cd->hwm -= LINK_SIZE;
+ offset = GET(cd->hwm, 0);
++
++ /* Check that the hwm handling hasn't gone wrong. This whole area is
++ rewritten in PCRE2 because there are some obscure cases. */
++
++ if (offset == 0 || codestart[offset-1] != OP_RECURSE)
++ {
++ errorcode = ERR10;
++ break;
++ }
++
+ recno = GET(codestart, offset);
+ if (recno != prev_recno)
+ {
+@@ -9366,7 +9503,7 @@ used in this code because at least one compiler gives a warning about loss of
+ "const" attribute if the cast (pcre_uchar *)codestart is used directly in the
+ function call. */
+
+-if ((options & PCRE_NO_AUTO_POSSESS) == 0)
++if (errorcode == 0 && (options & PCRE_NO_AUTO_POSSESS) == 0)
+ {
+ pcre_uchar *temp = (pcre_uchar *)codestart;
+ auto_possessify(temp, utf, cd);
+@@ -9380,7 +9517,7 @@ OP_RECURSE that are not fixed length get a diagnosic with a useful offset. The
+ exceptional ones forgo this. We scan the pattern to check that they are fixed
+ length, and set their lengths. */
+
+-if (cd->check_lookbehind)
++if (errorcode == 0 && cd->check_lookbehind)
+ {
+ pcre_uchar *cc = (pcre_uchar *)codestart;
+
+@@ -9593,4 +9730,3 @@ return (pcre32 *)re;
+ }
+
+ /* End of pcre_compile.c */
+-
+diff --git a/ext/pcre/pcrelib/pcre_exec.c b/ext/pcre/pcrelib/pcre_exec.c
+index 3942076..24b23ca 100644
+--- a/ext/pcre/pcrelib/pcre_exec.c
++++ b/ext/pcre/pcrelib/pcre_exec.c
+@@ -688,7 +688,7 @@ the alternative names that are used. */
+ #define foc number
+ #define save_mark data
+
+-/* These statements are here to stop the compiler complaining about uninitialized
++/* These statements are here to stop the compiler complaining about unitialized
+ variables. */
+
+ #ifdef SUPPORT_UCP
+@@ -6685,7 +6685,8 @@ if (md->offset_vector != NULL)
+ register int *iend = iptr - re->top_bracket;
+ if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
+ while (--iptr >= iend) *iptr = -1;
+- md->offset_vector[0] = md->offset_vector[1] = -1;
++ if (offsetcount > 0) md->offset_vector[0] = -1;
++ if (offsetcount > 1) md->offset_vector[1] = -1;
+ }
+
+ /* Set up the first character to match, if available. The first_char value is
+diff --git a/ext/pcre/pcrelib/pcre_internal.h b/ext/pcre/pcrelib/pcre_internal.h
+index 4c4817d..aec1879 100644
+--- a/ext/pcre/pcrelib/pcre_internal.h
++++ b/ext/pcre/pcrelib/pcre_internal.h
+@@ -988,7 +988,7 @@ other. NOTE: The values also appear in pcre_jit_compile.c. */
+ #ifndef EBCDIC
+
+ #define HSPACE_LIST \
+- CHAR_HT, CHAR_SPACE, 0xa0, \
++ CHAR_HT, CHAR_SPACE, CHAR_NBSP, \
+ 0x1680, 0x180e, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, \
+ 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x202f, 0x205f, 0x3000, \
+ NOTACHAR
+@@ -1014,7 +1014,7 @@ other. NOTE: The values also appear in pcre_jit_compile.c. */
+ #define HSPACE_BYTE_CASES \
+ case CHAR_HT: \
+ case CHAR_SPACE: \
+- case 0xa0 /* NBSP */
++ case CHAR_NBSP
+
+ #define HSPACE_CASES \
+ HSPACE_BYTE_CASES: \
+@@ -1041,11 +1041,12 @@ other. NOTE: The values also appear in pcre_jit_compile.c. */
+ /* ------ EBCDIC environments ------ */
+
+ #else
+-#define HSPACE_LIST CHAR_HT, CHAR_SPACE
++#define HSPACE_LIST CHAR_HT, CHAR_SPACE, CHAR_NBSP, NOTACHAR
+
+ #define HSPACE_BYTE_CASES \
+ case CHAR_HT: \
+- case CHAR_SPACE
++ case CHAR_SPACE: \
++ case CHAR_NBSP
+
+ #define HSPACE_CASES HSPACE_BYTE_CASES
+
+@@ -1219,6 +1220,7 @@ same code point. */
+
+ #define CHAR_ESC '\047'
+ #define CHAR_DEL '\007'
++#define CHAR_NBSP '\x41'
+ #define STR_ESC "\047"
+ #define STR_DEL "\007"
+
+@@ -1233,6 +1235,7 @@ a positive value. */
+ #define CHAR_NEL ((unsigned char)'\x85')
+ #define CHAR_ESC '\033'
+ #define CHAR_DEL '\177'
++#define CHAR_NBSP ((unsigned char)'\xa0')
+
+ #define STR_LF "\n"
+ #define STR_NL STR_LF
+@@ -1610,6 +1613,7 @@ only. */
+ #define CHAR_VERTICAL_LINE '\174'
+ #define CHAR_RIGHT_CURLY_BRACKET '\175'
+ #define CHAR_TILDE '\176'
++#define CHAR_NBSP ((unsigned char)'\xa0')
+
+ #define STR_HT "\011"
+ #define STR_VT "\013"
+@@ -1766,6 +1770,10 @@ only. */
+
+ /* Escape items that are just an encoding of a particular data value. */
+
++#ifndef ESC_a
++#define ESC_a CHAR_BEL
++#endif
++
+ #ifndef ESC_e
+ #define ESC_e CHAR_ESC
+ #endif
+@@ -2450,6 +2458,7 @@ typedef struct compile_data {
+ BOOL had_pruneorskip; /* (*PRUNE) or (*SKIP) encountered */
+ BOOL check_lookbehind; /* Lookbehinds need later checking */
+ BOOL dupnames; /* Duplicate names exist */
++ BOOL dupgroups; /* Duplicate groups exist: (?| found */
+ BOOL iscondassert; /* Next assert is a condition */
+ int nltype; /* Newline type */
+ int nllen; /* Newline string length */
+diff --git a/ext/pcre/pcrelib/pcre_jit_compile.c b/ext/pcre/pcrelib/pcre_jit_compile.c
+index debdf6e..445de0c 100644
+--- a/ext/pcre/pcrelib/pcre_jit_compile.c
++++ b/ext/pcre/pcrelib/pcre_jit_compile.c
+@@ -1064,6 +1064,7 @@ pcre_uchar *alternative;
+ pcre_uchar *end = NULL;
+ int private_data_ptr = *private_data_start;
+ int space, size, bracketlen;
++BOOL repeat_check = TRUE;
+
+ while (cc < ccend)
+ {
+@@ -1071,9 +1072,10 @@ while (cc < ccend)
+ size = 0;
+ bracketlen = 0;
+ if (private_data_ptr > SLJIT_MAX_LOCAL_SIZE)
+- return;
++ break;
+
+- if (*cc == OP_ONCE || *cc == OP_ONCE_NC || *cc == OP_BRA || *cc == OP_CBRA || *cc == OP_COND)
++ if (repeat_check && (*cc == OP_ONCE || *cc == OP_ONCE_NC || *cc == OP_BRA || *cc == OP_CBRA || *cc == OP_COND))
++ {
+ if (detect_repeat(common, cc))
+ {
+ /* These brackets are converted to repeats, so no global
+@@ -1081,6 +1083,8 @@ while (cc < ccend)
+ if (cc >= end)
+ end = bracketend(cc);
+ }
++ }
++ repeat_check = TRUE;
+
+ switch(*cc)
+ {
+@@ -1136,6 +1140,13 @@ while (cc < ccend)
+ bracketlen = 1 + LINK_SIZE + IMM2_SIZE;
+ break;
+
++ case OP_BRAZERO:
++ case OP_BRAMINZERO:
++ case OP_BRAPOSZERO:
++ repeat_check = FALSE;
++ size = 1;
++ break;
++
+ CASE_ITERATOR_PRIVATE_DATA_1
+ space = 1;
+ size = -2;
+@@ -1162,12 +1173,17 @@ while (cc < ccend)
+ size = 1;
+ break;
+
+- CASE_ITERATOR_TYPE_PRIVATE_DATA_2B
++ case OP_TYPEUPTO:
+ if (cc[1 + IMM2_SIZE] != OP_ANYNL && cc[1 + IMM2_SIZE] != OP_EXTUNI)
+ space = 2;
+ size = 1 + IMM2_SIZE;
+ break;
+
++ case OP_TYPEMINUPTO:
++ space = 2;
++ size = 1 + IMM2_SIZE;
++ break;
++
+ case OP_CLASS:
+ case OP_NCLASS:
+ size += 1 + 32 / sizeof(pcre_uchar);
+@@ -1316,6 +1332,13 @@ while (cc < ccend)
+ cc += 1 + LINK_SIZE + IMM2_SIZE;
+ break;
+
++ case OP_THEN:
++ stack_restore = TRUE;
++ if (common->control_head_ptr != 0)
++ *needs_control_head = TRUE;
++ cc ++;
++ break;
++
+ default:
+ stack_restore = TRUE;
+ /* Fall through. */
+@@ -2220,6 +2243,7 @@ while (current != NULL)
+ SLJIT_ASSERT_STOP();
+ break;
+ }
++ SLJIT_ASSERT(current > (sljit_sw*)current[-1]);
+ current = (sljit_sw*)current[-1];
+ }
+ return -1;
+@@ -3209,7 +3233,7 @@ bytes[len] = byte;
+ bytes[0] = len;
+ }
+
+-static int scan_prefix(compiler_common *common, pcre_uchar *cc, pcre_uint32 *chars, pcre_uint8 *bytes, int max_chars)
++static int scan_prefix(compiler_common *common, pcre_uchar *cc, pcre_uint32 *chars, pcre_uint8 *bytes, int max_chars, pcre_uint32 *rec_count)
+ {
+ /* Recursive function, which scans prefix literals. */
+ BOOL last, any, caseless;
+@@ -3227,9 +3251,14 @@ pcre_uchar othercase[1];
+ repeat = 1;
+ while (TRUE)
+ {
++ if (*rec_count == 0)
++ return 0;
++ (*rec_count)--;
++
+ last = TRUE;
+ any = FALSE;
+ caseless = FALSE;
++
+ switch (*cc)
+ {
+ case OP_CHARI:
+@@ -3291,7 +3320,7 @@ while (TRUE)
+ #ifdef SUPPORT_UTF
+ if (common->utf && HAS_EXTRALEN(*cc)) len += GET_EXTRALEN(*cc);
+ #endif
+- max_chars = scan_prefix(common, cc + len, chars, bytes, max_chars);
++ max_chars = scan_prefix(common, cc + len, chars, bytes, max_chars, rec_count);
+ if (max_chars == 0)
+ return consumed;
+ last = FALSE;
+@@ -3314,7 +3343,7 @@ while (TRUE)
+ alternative = cc + GET(cc, 1);
+ while (*alternative == OP_ALT)
+ {
+- max_chars = scan_prefix(common, alternative + 1 + LINK_SIZE, chars, bytes, max_chars);
++ max_chars = scan_prefix(common, alternative + 1 + LINK_SIZE, chars, bytes, max_chars, rec_count);
+ if (max_chars == 0)
+ return consumed;
+ alternative += GET(alternative, 1);
+@@ -3556,6 +3585,7 @@ int i, max, from;
+ int range_right = -1, range_len = 3 - 1;
+ sljit_ub *update_table = NULL;
+ BOOL in_range;
++pcre_uint32 rec_count;
+
+ for (i = 0; i < MAX_N_CHARS; i++)
+ {
+@@ -3564,7 +3594,8 @@ for (i = 0; i < MAX_N_CHARS; i++)
+ bytes[i * MAX_N_BYTES] = 0;
+ }
+
+-max = scan_prefix(common, common->start, chars, bytes, MAX_N_CHARS);
++rec_count = 10000;
++max = scan_prefix(common, common->start, chars, bytes, MAX_N_CHARS, &rec_count);
+
+ if (max <= 1)
+ return FALSE;
+@@ -4311,8 +4342,10 @@ switch(length)
+ case 4:
+ if ((ranges[1] - ranges[0]) == (ranges[3] - ranges[2])
+ && (ranges[0] | (ranges[2] - ranges[0])) == ranges[2]
++ && (ranges[1] & (ranges[2] - ranges[0])) == 0
+ && is_powerof2(ranges[2] - ranges[0]))
+ {
++ SLJIT_ASSERT((ranges[0] & (ranges[2] - ranges[0])) == 0 && (ranges[2] & ranges[3] & (ranges[2] - ranges[0])) != 0);
+ OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, ranges[2] - ranges[0]);
+ if (ranges[2] + 1 != ranges[3])
+ {
+@@ -4900,9 +4933,10 @@ else if ((cc[-1] & XCL_MAP) != 0)
+ if (!check_class_ranges(common, (const pcre_uint8 *)cc, FALSE, TRUE, list))
+ {
+ #ifdef COMPILE_PCRE8
+- SLJIT_ASSERT(common->utf);
++ jump = NULL;
++ if (common->utf)
+ #endif
+- jump = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, 255);
++ jump = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, 255);
+
+ OP2(SLJIT_AND, TMP2, 0, TMP1, 0, SLJIT_IMM, 0x7);
+ OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 3);
+@@ -4911,7 +4945,10 @@ else if ((cc[-1] & XCL_MAP) != 0)
+ OP2(SLJIT_AND | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, TMP2, 0);
+ add_jump(compiler, list, JUMP(SLJIT_NOT_ZERO));
+
+- JUMPHERE(jump);
++#ifdef COMPILE_PCRE8
++ if (common->utf)
++#endif
++ JUMPHERE(jump);
+ }
+
+ OP1(SLJIT_MOV, TMP1, 0, TMP3, 0);
+@@ -5219,7 +5256,7 @@ while (*cc != XCL_END)
+ OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_UNUSED, 0, SLJIT_LESS_EQUAL);
+
+ SET_CHAR_OFFSET(0);
+- OP2(SLJIT_SUB | SLJIT_SET_U, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0xff);
++ OP2(SLJIT_SUB | SLJIT_SET_U, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x7f);
+ OP_FLAGS(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_LESS_EQUAL);
+
+ SET_TYPE_OFFSET(ucp_Pc);
+@@ -7665,6 +7702,10 @@ while (*cc != OP_KETRPOS)
+ OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), STR_PTR, 0);
+ }
+
++ /* Even if the match is empty, we need to reset the control head. */
++ if (needs_control_head)
++ OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->control_head_ptr, SLJIT_MEM1(STACK_TOP), STACK(stack));
++
+ if (opcode == OP_SBRAPOS || opcode == OP_SCBRAPOS)
+ add_jump(compiler, &emptymatch, CMP(SLJIT_EQUAL, TMP1, 0, STR_PTR, 0));
+
+@@ -7692,6 +7733,10 @@ while (*cc != OP_KETRPOS)
+ OP1(SLJIT_MOV, SLJIT_MEM1(TMP2), (framesize + 1) * sizeof(sljit_sw), STR_PTR, 0);
+ }
+
++ /* Even if the match is empty, we need to reset the control head. */
++ if (needs_control_head)
++ OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->control_head_ptr, SLJIT_MEM1(STACK_TOP), STACK(stack));
++
+ if (opcode == OP_SBRAPOS || opcode == OP_SCBRAPOS)
+ add_jump(compiler, &emptymatch, CMP(SLJIT_EQUAL, TMP1, 0, STR_PTR, 0));
+
+@@ -7704,9 +7749,6 @@ while (*cc != OP_KETRPOS)
+ }
+ }
+
+- if (needs_control_head)
+- OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->control_head_ptr, SLJIT_MEM1(STACK_TOP), STACK(stack));
+-
+ JUMPTO(SLJIT_JUMP, loop);
+ flush_stubs(common);
+
+@@ -8441,8 +8483,7 @@ while (cc < ccend)
+ OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(1), STR_PTR, 0);
+ }
+ BACKTRACK_AS(braminzero_backtrack)->matchingpath = LABEL();
+- if (cc[1] > OP_ASSERTBACK_NOT)
+- count_match(common);
++ count_match(common);
+ break;
+
+ case OP_ONCE:
+@@ -9624,7 +9665,7 @@ static SLJIT_INLINE void compile_recurse(compiler_common *common)
+ DEFINE_COMPILER;
+ pcre_uchar *cc = common->start + common->currententry->start;
+ pcre_uchar *ccbegin = cc + 1 + LINK_SIZE + (*cc == OP_BRA ? 0 : IMM2_SIZE);
+-pcre_uchar *ccend = bracketend(cc);
++pcre_uchar *ccend = bracketend(cc) - (1 + LINK_SIZE);
+ BOOL needs_control_head;
+ int framesize = get_framesize(common, cc, NULL, TRUE, &needs_control_head);
+ int private_data_size = get_private_data_copy_length(common, ccbegin, ccend, needs_control_head);
+@@ -9648,6 +9689,7 @@ set_jumps(common->currententry->calls, common->currententry->entry);
+
+ sljit_emit_fast_enter(compiler, TMP2, 0);
+ allocate_stack(common, private_data_size + framesize + alternativesize);
++count_match(common);
+ OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(private_data_size + framesize + alternativesize - 1), TMP2, 0);
+ copy_private_data(common, ccbegin, ccend, TRUE, private_data_size + framesize + alternativesize, framesize + alternativesize, needs_control_head);
+ if (needs_control_head)
+@@ -9992,6 +10034,7 @@ OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, stack));
+ OP1(SLJIT_MOV_UI, TMP1, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, limit_match));
+ OP1(SLJIT_MOV, STACK_TOP, 0, SLJIT_MEM1(TMP2), SLJIT_OFFSETOF(struct sljit_stack, base));
+ OP1(SLJIT_MOV, STACK_LIMIT, 0, SLJIT_MEM1(TMP2), SLJIT_OFFSETOF(struct sljit_stack, limit));
++OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 1);
+ OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LIMIT_MATCH, TMP1, 0);
+
+ if (mode == JIT_PARTIAL_SOFT_COMPILE)
+diff --git a/ext/pcre/pcrelib/pcre_study.c b/ext/pcre/pcrelib/pcre_study.c
+index 998fe23..7fd0ba0 100644
+--- a/ext/pcre/pcrelib/pcre_study.c
++++ b/ext/pcre/pcrelib/pcre_study.c
+@@ -71,6 +71,7 @@ rather than bytes.
+ startcode pointer to start of the whole pattern's code
+ options the compiling options
+ recurses chain of recurse_check to catch mutual recursion
++ countptr pointer to call count (to catch over complexity)
+
+ Returns: the minimum length
+ -1 if \C in UTF-8 mode or (*ACCEPT) was encountered
+@@ -80,7 +81,8 @@ Returns: the minimum length
+
+ static int
+ find_minlength(const REAL_PCRE *re, const pcre_uchar *code,
+- const pcre_uchar *startcode, int options, recurse_check *recurses)
++ const pcre_uchar *startcode, int options, recurse_check *recurses,
++ int *countptr)
+ {
+ int length = -1;
+ /* PCRE_UTF16 has the same value as PCRE_UTF8. */
+@@ -90,6 +92,8 @@ recurse_check this_recurse;
+ register int branchlength = 0;
+ register pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE;
+
++if ((*countptr)++ > 1000) return -1; /* too complex */
++
+ if (*code == OP_CBRA || *code == OP_SCBRA ||
+ *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE;
+
+@@ -131,7 +135,7 @@ for (;;)
+ case OP_SBRAPOS:
+ case OP_ONCE:
+ case OP_ONCE_NC:
+- d = find_minlength(re, cc, startcode, options, recurses);
++ d = find_minlength(re, cc, startcode, options, recurses, countptr);
+ if (d < 0) return d;
+ branchlength += d;
+ do cc += GET(cc, 1); while (*cc == OP_ALT);
+@@ -415,7 +419,8 @@ for (;;)
+ int dd;
+ this_recurse.prev = recurses;
+ this_recurse.group = cs;
+- dd = find_minlength(re, cs, startcode, options, &this_recurse);
++ dd = find_minlength(re, cs, startcode, options, &this_recurse,
++ countptr);
+ if (dd < d) d = dd;
+ }
+ }
+@@ -451,7 +456,8 @@ for (;;)
+ {
+ this_recurse.prev = recurses;
+ this_recurse.group = cs;
+- d = find_minlength(re, cs, startcode, options, &this_recurse);
++ d = find_minlength(re, cs, startcode, options, &this_recurse,
++ countptr);
+ }
+ }
+ }
+@@ -514,7 +520,7 @@ for (;;)
+ this_recurse.prev = recurses;
+ this_recurse.group = cs;
+ branchlength += find_minlength(re, cs, startcode, options,
+- &this_recurse);
++ &this_recurse, countptr);
+ }
+ }
+ cc += 1 + LINK_SIZE;
+@@ -1453,6 +1459,7 @@ pcre32_study(const pcre32 *external_re, int options, const char **errorptr)
+ #endif
+ {
+ int min;
++int count = 0;
+ BOOL bits_set = FALSE;
+ pcre_uint8 start_bits[32];
+ PUBL(extra) *extra = NULL;
+@@ -1539,7 +1546,7 @@ if ((re->options & PCRE_ANCHORED) == 0 &&
+
+ /* Find the minimum length of subject string. */
+
+-switch(min = find_minlength(re, code, code, re->options, NULL))
++switch(min = find_minlength(re, code, code, re->options, NULL, &count))
+ {
+ case -2: *errorptr = "internal error: missing capturing bracket"; return NULL;
+ case -3: *errorptr = "internal error: opcode not recognized"; return NULL;
+diff --git a/ext/pcre/pcrelib/pcre_xclass.c b/ext/pcre/pcrelib/pcre_xclass.c
+index c2b61f0..ef759a5 100644
+--- a/ext/pcre/pcrelib/pcre_xclass.c
++++ b/ext/pcre/pcrelib/pcre_xclass.c
+@@ -246,7 +246,7 @@ while ((t = *data++) != XCL_END)
+
+ case PT_PXPUNCT:
+ if ((PRIV(ucp_gentype)[prop->chartype] == ucp_P ||
+- (c < 256 && PRIV(ucp_gentype)[prop->chartype] == ucp_S)) == isprop)
++ (c < 128 && PRIV(ucp_gentype)[prop->chartype] == ucp_S)) == isprop)
+ return !negated;
+ break;
+
+diff --git a/ext/pcre/pcrelib/sljit/sljitConfig.h b/ext/pcre/pcrelib/sljit/sljitConfig.h
+index 10364c3..1c8a521 100644
+--- a/ext/pcre/pcrelib/sljit/sljitConfig.h
++++ b/ext/pcre/pcrelib/sljit/sljitConfig.h
+@@ -96,6 +96,15 @@
+ #define SLJIT_EXECUTABLE_ALLOCATOR 1
+ #endif
+
++/* Force cdecl calling convention even if a better calling
++ convention (e.g. fastcall) is supported by the C compiler.
++ If this option is enabled, C functions without
++ SLJIT_CALL can also be called from JIT code. */
++#ifndef SLJIT_USE_CDECL_CALLING_CONVENTION
++/* Disabled by default */
++#define SLJIT_USE_CDECL_CALLING_CONVENTION 0
++#endif
++
+ /* Return with error when an invalid argument is passed. */
+ #ifndef SLJIT_ARGUMENT_CHECKS
+ /* Disabled by default */
+diff --git a/ext/pcre/pcrelib/sljit/sljitConfigInternal.h b/ext/pcre/pcrelib/sljit/sljitConfigInternal.h
+index 3284012..16e3547 100644
+--- a/ext/pcre/pcrelib/sljit/sljitConfigInternal.h
++++ b/ext/pcre/pcrelib/sljit/sljitConfigInternal.h
+@@ -468,7 +468,12 @@ typedef double sljit_d;
+
+ #ifndef SLJIT_CALL
+
+-#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
++#if (defined SLJIT_USE_CDECL_CALLING_CONVENTION && SLJIT_USE_CDECL_CALLING_CONVENTION)
++
++/* Force cdecl. */
++#define SLJIT_CALL
++
++#elif (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
+
+ #if defined(__GNUC__) && !defined(__APPLE__)
+
+@@ -608,6 +613,12 @@ SLJIT_API_FUNC_ATTRIBUTE void sljit_free_unused_memory_exec(void);
+ #define SLJIT_LOCALS_OFFSET_BASE ((23 + 1) * sizeof(sljit_sw))
+ #endif
+
++#elif (defined SLJIT_CONFIG_TILEGX && SLJIT_CONFIG_TILEGX)
++
++#define SLJIT_NUMBER_OF_REGISTERS 10
++#define SLJIT_NUMBER_OF_SAVED_REGISTERS 5
++#define SLJIT_LOCALS_OFFSET_BASE 0
++
+ #elif (defined SLJIT_CONFIG_UNSUPPORTED && SLJIT_CONFIG_UNSUPPORTED)
+
+ #define SLJIT_NUMBER_OF_REGISTERS 0
+diff --git a/ext/pcre/pcrelib/sljit/sljitLir.c b/ext/pcre/pcrelib/sljit/sljitLir.c
+index 5039a7e..0f1b1c9 100644
+--- a/ext/pcre/pcrelib/sljit/sljitLir.c
++++ b/ext/pcre/pcrelib/sljit/sljitLir.c
+@@ -845,8 +845,8 @@ SLJIT_API_FUNC_ATTRIBUTE void sljit_compiler_verbose(struct sljit_compiler *comp
+ }
+
+ static SLJIT_CONST char* op0_names[] = {
+- (char*)"breakpoint", (char*)"nop",
+- (char*)"lumul", (char*)"lsmul", (char*)"ludiv", (char*)"lsdiv",
++ (char*)"breakpoint", (char*)"nop", (char*)"lumul", (char*)"lsmul",
++ (char*)"udivmod", (char*)"sdivmod", (char*)"udivi", (char*)"sdivi"
+ };
+
+ static SLJIT_CONST char* op1_names[] = {
+@@ -1036,7 +1036,7 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_op0(struct sljit_compiler
+ {
+ #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+ CHECK_ARGUMENT((op >= SLJIT_BREAKPOINT && op <= SLJIT_LSMUL)
+- || ((op & ~SLJIT_INT_OP) >= SLJIT_LUDIV && (op & ~SLJIT_INT_OP) <= SLJIT_LSDIV));
++ || ((op & ~SLJIT_INT_OP) >= SLJIT_UDIVMOD && (op & ~SLJIT_INT_OP) <= SLJIT_SDIVI));
+ CHECK_ARGUMENT(op < SLJIT_LUMUL || compiler->scratches >= 2);
+ #endif
+ #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
+@@ -1447,6 +1447,8 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_op_flags(struct sljit_com
+
+ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_get_local_base(struct sljit_compiler *compiler, sljit_si dst, sljit_sw dstw, sljit_sw offset)
+ {
++ SLJIT_UNUSED_ARG(offset);
++
+ #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+ FUNCTION_CHECK_DST(dst, dstw);
+ #endif
+@@ -1462,6 +1464,8 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_get_local_base(struct sljit_co
+
+ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_const(struct sljit_compiler *compiler, sljit_si dst, sljit_sw dstw, sljit_sw init_value)
+ {
++ SLJIT_UNUSED_ARG(init_value);
++
+ #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+ FUNCTION_CHECK_DST(dst, dstw);
+ #endif
+diff --git a/ext/pcre/pcrelib/sljit/sljitLir.h b/ext/pcre/pcrelib/sljit/sljitLir.h
+index 24c0f60..2e2e9ac09 100644
+--- a/ext/pcre/pcrelib/sljit/sljitLir.h
++++ b/ext/pcre/pcrelib/sljit/sljitLir.h
+@@ -687,7 +687,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_fast_return(struct sljit_compiler *
+ #define SLJIT_OP0_BASE 0
+
+ /* Flags: - (never set any flags)
+- Note: breakpoint instruction is not supported by all architectures (namely ppc)
++ Note: breakpoint instruction is not supported by all architectures (e.g. ppc)
+ It falls back to SLJIT_NOP in those cases. */
+ #define SLJIT_BREAKPOINT (SLJIT_OP0_BASE + 0)
+ /* Flags: - (never set any flags)
+@@ -696,24 +696,42 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_fast_return(struct sljit_compiler *
+ #define SLJIT_NOP (SLJIT_OP0_BASE + 1)
+ /* Flags: - (may destroy flags)
+ Unsigned multiplication of SLJIT_R0 and SLJIT_R1.
+- Result goes to SLJIT_R1:SLJIT_R0 (high:low) word */
++ Result is placed into SLJIT_R1:SLJIT_R0 (high:low) word */
+ #define SLJIT_LUMUL (SLJIT_OP0_BASE + 2)
+ /* Flags: - (may destroy flags)
+ Signed multiplication of SLJIT_R0 and SLJIT_R1.
+- Result goes to SLJIT_R1:SLJIT_R0 (high:low) word */
++ Result is placed into SLJIT_R1:SLJIT_R0 (high:low) word */
+ #define SLJIT_LSMUL (SLJIT_OP0_BASE + 3)
+ /* Flags: I - (may destroy flags)
+ Unsigned divide of the value in SLJIT_R0 by the value in SLJIT_R1.
+- The result is placed in SLJIT_R0 and the remainder goes to SLJIT_R1.
+- Note: if SLJIT_R1 contains 0, the behaviour is undefined. */
+-#define SLJIT_LUDIV (SLJIT_OP0_BASE + 4)
+-#define SLJIT_ILUDIV (SLJIT_LUDIV | SLJIT_INT_OP)
++ The result is placed into SLJIT_R0 and the remainder into SLJIT_R1.
++ Note: if SLJIT_R1 is 0, the behaviour is undefined. */
++#define SLJIT_UDIVMOD (SLJIT_OP0_BASE + 4)
++#define SLJIT_IUDIVMOD (SLJIT_UDIVMOD | SLJIT_INT_OP)
+ /* Flags: I - (may destroy flags)
+ Signed divide of the value in SLJIT_R0 by the value in SLJIT_R1.
+- The result is placed in SLJIT_R0 and the remainder goes to SLJIT_R1.
+- Note: if SLJIT_R1 contains 0, the behaviour is undefined. */
+-#define SLJIT_LSDIV (SLJIT_OP0_BASE + 5)
+-#define SLJIT_ILSDIV (SLJIT_LSDIV | SLJIT_INT_OP)
++ The result is placed into SLJIT_R0 and the remainder into SLJIT_R1.
++ Note: if SLJIT_R1 is 0, the behaviour is undefined.
++ Note: if SLJIT_R1 is -1 and SLJIT_R0 is integer min (0x800..00),
++ the behaviour is undefined. */
++#define SLJIT_SDIVMOD (SLJIT_OP0_BASE + 5)
++#define SLJIT_ISDIVMOD (SLJIT_SDIVMOD | SLJIT_INT_OP)
++/* Flags: I - (may destroy flags)
++ Unsigned divide of the value in SLJIT_R0 by the value in SLJIT_R1.
++ The result is placed into SLJIT_R0. SLJIT_R1 preserves its value.
++ Note: if SLJIT_R1 is 0, the behaviour is undefined.
++ Note: SLJIT_SDIV is single precision divide. */
++#define SLJIT_UDIVI (SLJIT_OP0_BASE + 6)
++#define SLJIT_IUDIVI (SLJIT_UDIVI | SLJIT_INT_OP)
++/* Flags: I - (may destroy flags)
++ Signed divide of the value in SLJIT_R0 by the value in SLJIT_R1.
++ The result is placed into SLJIT_R0. SLJIT_R1 preserves its value.
++ Note: if SLJIT_R1 is 0, the behaviour is undefined.
++ Note: if SLJIT_R1 is -1 and SLJIT_R0 is integer min (0x800..00),
++ the behaviour is undefined.
++ Note: SLJIT_SDIV is single precision divide. */
++#define SLJIT_SDIVI (SLJIT_OP0_BASE + 7)
++#define SLJIT_ISDIVI (SLJIT_SDIVI | SLJIT_INT_OP)
+
+ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler, sljit_si op);
+
+@@ -851,34 +869,6 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op2(struct sljit_compiler *compiler
+ sljit_si src1, sljit_sw src1w,
+ sljit_si src2, sljit_sw src2w);
+
+-/* The following function is a helper function for sljit_emit_op_custom.
+- It returns with the real machine register index ( >=0 ) of any SLJIT_R,
+- SLJIT_S and SLJIT_SP registers.
+-
+- Note: it returns with -1 for virtual registers (only on x86-32). */
+-
+-SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_get_register_index(sljit_si reg);
+-
+-/* The following function is a helper function for sljit_emit_op_custom.
+- It returns with the real machine register index of any SLJIT_FLOAT register.
+-
+- Note: the index is always an even number on ARM (except ARM-64), MIPS, and SPARC. */
+-
+-SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_get_float_register_index(sljit_si reg);
+-
+-/* Any instruction can be inserted into the instruction stream by
+- sljit_emit_op_custom. It has a similar purpose as inline assembly.
+- The size parameter must match to the instruction size of the target
+- architecture:
+-
+- x86: 0 < size <= 15. The instruction argument can be byte aligned.
+- Thumb2: if size == 2, the instruction argument must be 2 byte aligned.
+- if size == 4, the instruction argument must be 4 byte aligned.
+- Otherwise: size must be 4 and instruction argument must be 4 byte aligned. */
+-
+-SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op_custom(struct sljit_compiler *compiler,
+- void *instruction, sljit_si size);
+-
+ /* Returns with non-zero if fpu is available. */
+
+ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_is_fpu_available(void);
+@@ -1196,4 +1186,64 @@ SLJIT_API_FUNC_ATTRIBUTE void sljit_set_function_context(void** func_ptr, struct
+
+ #endif /* !(defined SLJIT_INDIRECT_CALL && SLJIT_INDIRECT_CALL) */
+
++/* --------------------------------------------------------------------- */
++/* CPU specific functions */
++/* --------------------------------------------------------------------- */
++
++/* The following function is a helper function for sljit_emit_op_custom.
++ It returns with the real machine register index ( >=0 ) of any SLJIT_R,
++ SLJIT_S and SLJIT_SP registers.
++
++ Note: it returns with -1 for virtual registers (only on x86-32). */
++
++SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_get_register_index(sljit_si reg);
++
++/* The following function is a helper function for sljit_emit_op_custom.
++ It returns with the real machine register index of any SLJIT_FLOAT register.
++
++ Note: the index is always an even number on ARM (except ARM-64), MIPS, and SPARC. */
++
++SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_get_float_register_index(sljit_si reg);
++
++/* Any instruction can be inserted into the instruction stream by
++ sljit_emit_op_custom. It has a similar purpose as inline assembly.
++ The size parameter must match to the instruction size of the target
++ architecture:
++
++ x86: 0 < size <= 15. The instruction argument can be byte aligned.
++ Thumb2: if size == 2, the instruction argument must be 2 byte aligned.
++ if size == 4, the instruction argument must be 4 byte aligned.
++ Otherwise: size must be 4 and instruction argument must be 4 byte aligned. */
++
++SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op_custom(struct sljit_compiler *compiler,
++ void *instruction, sljit_si size);
++
++#if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86)
++
++/* Returns with non-zero if sse2 is available. */
++
++SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_x86_is_sse2_available(void);
++
++/* Returns with non-zero if cmov instruction is available. */
++
++SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_x86_is_cmov_available(void);
++
++/* Emit a conditional mov instruction on x86 CPUs. This instruction
++ moves src to destination, if the condition is satisfied. Unlike
++ other arithmetic instructions, destination must be a register.
++ Before such instructions are emitted, cmov support should be
++ checked by sljit_x86_is_cmov_available function.
++ type must be between SLJIT_EQUAL and SLJIT_S_ORDERED
++ dst_reg must be a valid register and it can be combined
++ with SLJIT_INT_OP to perform 32 bit arithmetic
++ Flags: I - (never set any flags)
++ */
++
++SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_x86_emit_cmov(struct sljit_compiler *compiler,
++ sljit_si type,
++ sljit_si dst_reg,
++ sljit_si src, sljit_sw srcw);
++
++#endif
++
+ #endif /* _SLJIT_LIR_H_ */
+diff --git a/ext/pcre/pcrelib/sljit/sljitNativeARM_32.c b/ext/pcre/pcrelib/sljit/sljitNativeARM_32.c
+index aca1d31..5cd4c71 100644
+--- a/ext/pcre/pcrelib/sljit/sljitNativeARM_32.c
++++ b/ext/pcre/pcrelib/sljit/sljitNativeARM_32.c
+@@ -1833,18 +1833,33 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler
+ | (reg_map[SLJIT_R0] << 8)
+ | reg_map[TMP_REG1]);
+ #endif
+- case SLJIT_LUDIV:
+- case SLJIT_LSDIV:
+- if (compiler->scratches >= 3)
++ case SLJIT_UDIVMOD:
++ case SLJIT_SDIVMOD:
++ case SLJIT_UDIVI:
++ case SLJIT_SDIVI:
++ SLJIT_COMPILE_ASSERT((SLJIT_UDIVMOD & 0x2) == 0 && SLJIT_UDIVI - 0x2 == SLJIT_UDIVMOD, bad_div_opcode_assignments);
++ SLJIT_COMPILE_ASSERT(reg_map[2] == 1 && reg_map[3] == 2, bad_register_mapping);
++
++ if ((op >= SLJIT_UDIVI) && (compiler->scratches >= 3)) {
+ FAIL_IF(push_inst(compiler, 0xe52d2008 /* str r2, [sp, #-8]! */));
++ FAIL_IF(push_inst(compiler, 0xe58d1004 /* str r1, [sp, #4] */));
++ }
++ else if ((op >= SLJIT_UDIVI) || (compiler->scratches >= 3))
++ FAIL_IF(push_inst(compiler, 0xe52d0008 | (op >= SLJIT_UDIVI ? 0x1000 : 0x2000) /* str r1/r2, [sp, #-8]! */));
++
+ #if defined(__GNUC__)
+ FAIL_IF(sljit_emit_ijump(compiler, SLJIT_FAST_CALL, SLJIT_IMM,
+- (op == SLJIT_LUDIV ? SLJIT_FUNC_OFFSET(__aeabi_uidivmod) : SLJIT_FUNC_OFFSET(__aeabi_idivmod))));
++ ((op | 0x2) == SLJIT_UDIVI ? SLJIT_FUNC_OFFSET(__aeabi_uidivmod) : SLJIT_FUNC_OFFSET(__aeabi_idivmod))));
+ #else
+ #error "Software divmod functions are needed"
+ #endif
+- if (compiler->scratches >= 3)
+- return push_inst(compiler, 0xe49d2008 /* ldr r2, [sp], #8 */);
++
++ if ((op >= SLJIT_UDIVI) && (compiler->scratches >= 3)) {
++ FAIL_IF(push_inst(compiler, 0xe59d1004 /* ldr r1, [sp, #4] */));
++ FAIL_IF(push_inst(compiler, 0xe49d2008 /* ldr r2, [sp], #8 */));
++ }
++ else if ((op >= SLJIT_UDIVI) || (compiler->scratches >= 3))
++ return push_inst(compiler, 0xe49d0008 | (op >= SLJIT_UDIVI ? 0x1000 : 0x2000) /* ldr r1/r2, [sp], #8 */);
+ return SLJIT_SUCCESS;
+ }
+
+diff --git a/ext/pcre/pcrelib/sljit/sljitNativeARM_64.c b/ext/pcre/pcrelib/sljit/sljitNativeARM_64.c
+index b66455f..044a675 100644
+--- a/ext/pcre/pcrelib/sljit/sljitNativeARM_64.c
++++ b/ext/pcre/pcrelib/sljit/sljitNativeARM_64.c
+@@ -1087,14 +1087,20 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_enter(struct sljit_compiler *compil
+ saved_regs_size += sizeof(sljit_sw);
+ }
+ local_size -= saved_regs_size + SLJIT_LOCALS_OFFSET;
+- FAIL_IF(push_inst(compiler, SUBI | RD(TMP_SP) | RN(TMP_SP) | (saved_regs_size << 10)));
++ if (saved_regs_size > 0)
++ FAIL_IF(push_inst(compiler, SUBI | RD(TMP_SP) | RN(TMP_SP) | (saved_regs_size << 10)));
+ }
+
+ tmp = saveds < SLJIT_NUMBER_OF_SAVED_REGISTERS ? (SLJIT_S0 + 1 - saveds) : SLJIT_FIRST_SAVED_REG;
+ prev = -1;
+ for (i = SLJIT_S0; i >= tmp; i--) {
+ if (prev == -1) {
+- prev = i;
++ if (!(offs & (1 << 15))) {
++ prev = i;
++ continue;
++ }
++ FAIL_IF(push_inst(compiler, STRI | RT(i) | RN(TMP_SP) | (offs >> 5)));
++ offs += 1 << 15;
+ continue;
+ }
+ FAIL_IF(push_inst(compiler, STP | RT(prev) | RT2(i) | RN(TMP_SP) | offs));
+@@ -1104,7 +1110,12 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_enter(struct sljit_compiler *compil
+
+ for (i = scratches; i >= SLJIT_FIRST_SAVED_REG; i--) {
+ if (prev == -1) {
+- prev = i;
++ if (!(offs & (1 << 15))) {
++ prev = i;
++ continue;
++ }
++ FAIL_IF(push_inst(compiler, STRI | RT(i) | RN(TMP_SP) | (offs >> 5)));
++ offs += 1 << 15;
+ continue;
+ }
+ FAIL_IF(push_inst(compiler, STP | RT(prev) | RT2(i) | RN(TMP_SP) | offs));
+@@ -1112,8 +1123,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_enter(struct sljit_compiler *compil
+ prev = -1;
+ }
+
+- if (prev != -1)
+- FAIL_IF(push_inst(compiler, STRI | RT(prev) | RN(TMP_SP) | (offs >> 5)));
++ SLJIT_ASSERT(prev == -1);
+
+ if (compiler->local_size > (63 * sizeof(sljit_sw))) {
+ /* The local_size is already adjusted by the saved registers. */
+@@ -1188,7 +1198,12 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_return(struct sljit_compiler *compi
+ prev = -1;
+ for (i = SLJIT_S0; i >= tmp; i--) {
+ if (prev == -1) {
+- prev = i;
++ if (!(offs & (1 << 15))) {
++ prev = i;
++ continue;
++ }
++ FAIL_IF(push_inst(compiler, LDRI | RT(i) | RN(TMP_SP) | (offs >> 5)));
++ offs += 1 << 15;
+ continue;
+ }
+ FAIL_IF(push_inst(compiler, LDP | RT(prev) | RT2(i) | RN(TMP_SP) | offs));
+@@ -1198,7 +1213,12 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_return(struct sljit_compiler *compi
+
+ for (i = compiler->scratches; i >= SLJIT_FIRST_SAVED_REG; i--) {
+ if (prev == -1) {
+- prev = i;
++ if (!(offs & (1 << 15))) {
++ prev = i;
++ continue;
++ }
++ FAIL_IF(push_inst(compiler, LDRI | RT(i) | RN(TMP_SP) | (offs >> 5)));
++ offs += 1 << 15;
+ continue;
+ }
+ FAIL_IF(push_inst(compiler, LDP | RT(prev) | RT2(i) | RN(TMP_SP) | offs));
+@@ -1206,13 +1226,12 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_return(struct sljit_compiler *compi
+ prev = -1;
+ }
+
+- if (prev != -1)
+- FAIL_IF(push_inst(compiler, LDRI | RT(prev) | RN(TMP_SP) | (offs >> 5)));
++ SLJIT_ASSERT(prev == -1);
+
+ if (compiler->local_size <= (63 * sizeof(sljit_sw))) {
+ FAIL_IF(push_inst(compiler, LDP_PST | 29 | RT2(TMP_LR)
+ | RN(TMP_SP) | (((local_size >> 3) & 0x7f) << 15)));
+- } else {
++ } else if (saved_regs_size > 0) {
+ FAIL_IF(push_inst(compiler, ADDI | RD(TMP_SP) | RN(TMP_SP) | (saved_regs_size << 10)));
+ }
+
+@@ -1242,12 +1261,15 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler
+ FAIL_IF(push_inst(compiler, ORR | RD(TMP_REG1) | RN(TMP_ZERO) | RM(SLJIT_R0)));
+ FAIL_IF(push_inst(compiler, MADD | RD(SLJIT_R0) | RN(SLJIT_R0) | RM(SLJIT_R1) | RT2(TMP_ZERO)));
+ return push_inst(compiler, (op == SLJIT_LUMUL ? UMULH : SMULH) | RD(SLJIT_R1) | RN(TMP_REG1) | RM(SLJIT_R1));
+- case SLJIT_LUDIV:
+- case SLJIT_LSDIV:
++ case SLJIT_UDIVMOD:
++ case SLJIT_SDIVMOD:
+ FAIL_IF(push_inst(compiler, (ORR ^ inv_bits) | RD(TMP_REG1) | RN(TMP_ZERO) | RM(SLJIT_R0)));
+- FAIL_IF(push_inst(compiler, ((op == SLJIT_LUDIV ? UDIV : SDIV) ^ inv_bits) | RD(SLJIT_R0) | RN(SLJIT_R0) | RM(SLJIT_R1)));
++ FAIL_IF(push_inst(compiler, ((op == SLJIT_UDIVMOD ? UDIV : SDIV) ^ inv_bits) | RD(SLJIT_R0) | RN(SLJIT_R0) | RM(SLJIT_R1)));
+ FAIL_IF(push_inst(compiler, (MADD ^ inv_bits) | RD(SLJIT_R1) | RN(SLJIT_R0) | RM(SLJIT_R1) | RT2(TMP_ZERO)));
+ return push_inst(compiler, (SUB ^ inv_bits) | RD(SLJIT_R1) | RN(TMP_REG1) | RM(SLJIT_R1));
++ case SLJIT_UDIVI:
++ case SLJIT_SDIVI:
++ return push_inst(compiler, ((op == SLJIT_UDIVI ? UDIV : SDIV) ^ inv_bits) | RD(SLJIT_R0) | RN(SLJIT_R0) | RM(SLJIT_R1));
+ }
+
+ return SLJIT_SUCCESS;
+diff --git a/ext/pcre/pcrelib/sljit/sljitNativeARM_T2_32.c b/ext/pcre/pcrelib/sljit/sljitNativeARM_T2_32.c
+index 6e38cec..f9803f5 100644
+--- a/ext/pcre/pcrelib/sljit/sljitNativeARM_T2_32.c
++++ b/ext/pcre/pcrelib/sljit/sljitNativeARM_T2_32.c
+@@ -1239,6 +1239,9 @@ extern int __aeabi_idivmod(int numerator, int denominator);
+
+ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler, sljit_si op)
+ {
++ sljit_sw saved_reg_list[3];
++ sljit_sw saved_reg_count;
++
+ CHECK_ERROR();
+ CHECK(check_sljit_emit_op0(compiler, op));
+
+@@ -1255,24 +1258,53 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler
+ | (reg_map[SLJIT_R0] << 12)
+ | (reg_map[SLJIT_R0] << 16)
+ | reg_map[SLJIT_R1]);
+- case SLJIT_LUDIV:
+- case SLJIT_LSDIV:
+- if (compiler->scratches >= 4) {
+- FAIL_IF(push_inst32(compiler, 0xf84d2d04 /* str r2, [sp, #-4]! */));
+- FAIL_IF(push_inst32(compiler, 0xf84dcd04 /* str ip, [sp, #-4]! */));
+- } else if (compiler->scratches >= 3)
+- FAIL_IF(push_inst32(compiler, 0xf84d2d08 /* str r2, [sp, #-8]! */));
++ case SLJIT_UDIVMOD:
++ case SLJIT_SDIVMOD:
++ case SLJIT_UDIVI:
++ case SLJIT_SDIVI:
++ SLJIT_COMPILE_ASSERT((SLJIT_UDIVMOD & 0x2) == 0 && SLJIT_UDIVI - 0x2 == SLJIT_UDIVMOD, bad_div_opcode_assignments);
++ SLJIT_COMPILE_ASSERT(reg_map[2] == 1 && reg_map[3] == 2 && reg_map[4] == 12, bad_register_mapping);
++
++ saved_reg_count = 0;
++ if (compiler->scratches >= 4)
++ saved_reg_list[saved_reg_count++] = 12;
++ if (compiler->scratches >= 3)
++ saved_reg_list[saved_reg_count++] = 2;
++ if (op >= SLJIT_UDIVI)
++ saved_reg_list[saved_reg_count++] = 1;
++
++ if (saved_reg_count > 0) {
++ FAIL_IF(push_inst32(compiler, 0xf84d0d00 | (saved_reg_count >= 3 ? 16 : 8)
++ | (saved_reg_list[0] << 12) /* str rX, [sp, #-8/-16]! */));
++ if (saved_reg_count >= 2) {
++ SLJIT_ASSERT(saved_reg_list[1] < 8);
++ FAIL_IF(push_inst16(compiler, 0x9001 | (saved_reg_list[1] << 8) /* str rX, [sp, #4] */));
++ }
++ if (saved_reg_count >= 3) {
++ SLJIT_ASSERT(saved_reg_list[2] < 8);
++ FAIL_IF(push_inst16(compiler, 0x9002 | (saved_reg_list[2] << 8) /* str rX, [sp, #8] */));
++ }
++ }
++
+ #if defined(__GNUC__)
+ FAIL_IF(sljit_emit_ijump(compiler, SLJIT_FAST_CALL, SLJIT_IMM,
+- (op == SLJIT_LUDIV ? SLJIT_FUNC_OFFSET(__aeabi_uidivmod) : SLJIT_FUNC_OFFSET(__aeabi_idivmod))));
++ ((op | 0x2) == SLJIT_UDIVI ? SLJIT_FUNC_OFFSET(__aeabi_uidivmod) : SLJIT_FUNC_OFFSET(__aeabi_idivmod))));
+ #else
+ #error "Software divmod functions are needed"
+ #endif
+- if (compiler->scratches >= 4) {
+- FAIL_IF(push_inst32(compiler, 0xf85dcb04 /* ldr ip, [sp], #4 */));
+- return push_inst32(compiler, 0xf85d2b04 /* ldr r2, [sp], #4 */);
+- } else if (compiler->scratches >= 3)
+- return push_inst32(compiler, 0xf85d2b08 /* ldr r2, [sp], #8 */);
++
++ if (saved_reg_count > 0) {
++ if (saved_reg_count >= 3) {
++ SLJIT_ASSERT(saved_reg_list[2] < 8);
++ FAIL_IF(push_inst16(compiler, 0x9802 | (saved_reg_list[2] << 8) /* ldr rX, [sp, #8] */));
++ }
++ if (saved_reg_count >= 2) {
++ SLJIT_ASSERT(saved_reg_list[1] < 8);
++ FAIL_IF(push_inst16(compiler, 0x9801 | (saved_reg_list[1] << 8) /* ldr rX, [sp, #4] */));
++ }
++ return push_inst32(compiler, 0xf85d0b00 | (saved_reg_count >= 3 ? 16 : 8)
++ | (saved_reg_list[0] << 12) /* ldr rX, [sp], #8/16 */);
++ }
+ return SLJIT_SUCCESS;
+ }
+
+diff --git a/ext/pcre/pcrelib/sljit/sljitNativeMIPS_common.c b/ext/pcre/pcrelib/sljit/sljitNativeMIPS_common.c
+index 3e2c9f0..cf3535f 100644
+--- a/ext/pcre/pcrelib/sljit/sljitNativeMIPS_common.c
++++ b/ext/pcre/pcrelib/sljit/sljitNativeMIPS_common.c
+@@ -1053,8 +1053,11 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler
+ #endif
+ FAIL_IF(push_inst(compiler, MFLO | D(SLJIT_R0), DR(SLJIT_R0)));
+ return push_inst(compiler, MFHI | D(SLJIT_R1), DR(SLJIT_R1));
+- case SLJIT_LUDIV:
+- case SLJIT_LSDIV:
++ case SLJIT_UDIVMOD:
++ case SLJIT_SDIVMOD:
++ case SLJIT_UDIVI:
++ case SLJIT_SDIVI:
++ SLJIT_COMPILE_ASSERT((SLJIT_UDIVMOD & 0x2) == 0 && SLJIT_UDIVI - 0x2 == SLJIT_UDIVMOD, bad_div_opcode_assignments);
+ #if !(defined SLJIT_MIPS_R1 && SLJIT_MIPS_R1)
+ FAIL_IF(push_inst(compiler, NOP, UNMOVABLE_INS));
+ FAIL_IF(push_inst(compiler, NOP, UNMOVABLE_INS));
+@@ -1062,15 +1065,15 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler
+
+ #if (defined SLJIT_CONFIG_MIPS_64 && SLJIT_CONFIG_MIPS_64)
+ if (int_op)
+- FAIL_IF(push_inst(compiler, (op == SLJIT_LUDIV ? DIVU : DIV) | S(SLJIT_R0) | T(SLJIT_R1), MOVABLE_INS));
++ FAIL_IF(push_inst(compiler, ((op | 0x2) == SLJIT_UDIVI ? DIVU : DIV) | S(SLJIT_R0) | T(SLJIT_R1), MOVABLE_INS));
+ else
+- FAIL_IF(push_inst(compiler, (op == SLJIT_LUDIV ? DDIVU : DDIV) | S(SLJIT_R0) | T(SLJIT_R1), MOVABLE_INS));
++ FAIL_IF(push_inst(compiler, ((op | 0x2) == SLJIT_UDIVI ? DDIVU : DDIV) | S(SLJIT_R0) | T(SLJIT_R1), MOVABLE_INS));
+ #else
+- FAIL_IF(push_inst(compiler, (op == SLJIT_LUDIV ? DIVU : DIV) | S(SLJIT_R0) | T(SLJIT_R1), MOVABLE_INS));
++ FAIL_IF(push_inst(compiler, ((op | 0x2) == SLJIT_UDIVI ? DIVU : DIV) | S(SLJIT_R0) | T(SLJIT_R1), MOVABLE_INS));
+ #endif
+
+ FAIL_IF(push_inst(compiler, MFLO | D(SLJIT_R0), DR(SLJIT_R0)));
+- return push_inst(compiler, MFHI | D(SLJIT_R1), DR(SLJIT_R1));
++ return (op >= SLJIT_UDIVI) ? SLJIT_SUCCESS : push_inst(compiler, MFHI | D(SLJIT_R1), DR(SLJIT_R1));
+ }
+
+ return SLJIT_SUCCESS;
+diff --git a/ext/pcre/pcrelib/sljit/sljitNativePPC_common.c b/ext/pcre/pcrelib/sljit/sljitNativePPC_common.c
+index 08d5356..b6a043f 100644
+--- a/ext/pcre/pcrelib/sljit/sljitNativePPC_common.c
++++ b/ext/pcre/pcrelib/sljit/sljitNativePPC_common.c
+@@ -1267,22 +1267,23 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler
+ FAIL_IF(push_inst(compiler, MULLW | D(SLJIT_R0) | A(TMP_REG1) | B(SLJIT_R1)));
+ return push_inst(compiler, (op == SLJIT_LUMUL ? MULHWU : MULHW) | D(SLJIT_R1) | A(TMP_REG1) | B(SLJIT_R1));
+ #endif
+- case SLJIT_LUDIV:
+- case SLJIT_LSDIV:
++ case SLJIT_UDIVMOD:
++ case SLJIT_SDIVMOD:
+ FAIL_IF(push_inst(compiler, OR | S(SLJIT_R0) | A(TMP_REG1) | B(SLJIT_R0)));
+ #if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
+- if (int_op) {
+- FAIL_IF(push_inst(compiler, (op == SLJIT_LUDIV ? DIVWU : DIVW) | D(SLJIT_R0) | A(TMP_REG1) | B(SLJIT_R1)));
+- FAIL_IF(push_inst(compiler, MULLW | D(SLJIT_R1) | A(SLJIT_R0) | B(SLJIT_R1)));
+- } else {
+- FAIL_IF(push_inst(compiler, (op == SLJIT_LUDIV ? DIVDU : DIVD) | D(SLJIT_R0) | A(TMP_REG1) | B(SLJIT_R1)));
+- FAIL_IF(push_inst(compiler, MULLD | D(SLJIT_R1) | A(SLJIT_R0) | B(SLJIT_R1)));
+- }
+- return push_inst(compiler, SUBF | D(SLJIT_R1) | A(SLJIT_R1) | B(TMP_REG1));
++ FAIL_IF(push_inst(compiler, (int_op ? (op == SLJIT_UDIVMOD ? DIVWU : DIVW) : (op == SLJIT_UDIVMOD ? DIVDU : DIVD)) | D(SLJIT_R0) | A(SLJIT_R0) | B(SLJIT_R1)));
++ FAIL_IF(push_inst(compiler, (int_op ? MULLW : MULLD) | D(SLJIT_R1) | A(SLJIT_R0) | B(SLJIT_R1)));
+ #else
+- FAIL_IF(push_inst(compiler, (op == SLJIT_LUDIV ? DIVWU : DIVW) | D(SLJIT_R0) | A(TMP_REG1) | B(SLJIT_R1)));
++ FAIL_IF(push_inst(compiler, (op == SLJIT_UDIVMOD ? DIVWU : DIVW) | D(SLJIT_R0) | A(SLJIT_R0) | B(SLJIT_R1)));
+ FAIL_IF(push_inst(compiler, MULLW | D(SLJIT_R1) | A(SLJIT_R0) | B(SLJIT_R1)));
++#endif
+ return push_inst(compiler, SUBF | D(SLJIT_R1) | A(SLJIT_R1) | B(TMP_REG1));
++ case SLJIT_UDIVI:
++ case SLJIT_SDIVI:
++#if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
++ return push_inst(compiler, (int_op ? (op == SLJIT_UDIVI ? DIVWU : DIVW) : (op == SLJIT_UDIVI ? DIVDU : DIVD)) | D(SLJIT_R0) | A(SLJIT_R0) | B(SLJIT_R1));
++#else
++ return push_inst(compiler, (op == SLJIT_UDIVI ? DIVWU : DIVW) | D(SLJIT_R0) | A(SLJIT_R0) | B(SLJIT_R1));
+ #endif
+ }
+
+diff --git a/ext/pcre/pcrelib/sljit/sljitNativeSPARC_common.c b/ext/pcre/pcrelib/sljit/sljitNativeSPARC_common.c
+index 0b1927a..327c426 100644
+--- a/ext/pcre/pcrelib/sljit/sljitNativeSPARC_common.c
++++ b/ext/pcre/pcrelib/sljit/sljitNativeSPARC_common.c
+@@ -777,20 +777,25 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler
+ #else
+ #error "Implementation required"
+ #endif
+- case SLJIT_LUDIV:
+- case SLJIT_LSDIV:
++ case SLJIT_UDIVMOD:
++ case SLJIT_SDIVMOD:
++ case SLJIT_UDIVI:
++ case SLJIT_SDIVI:
++ SLJIT_COMPILE_ASSERT((SLJIT_UDIVMOD & 0x2) == 0 && SLJIT_UDIVI - 0x2 == SLJIT_UDIVMOD, bad_div_opcode_assignments);
+ #if (defined SLJIT_CONFIG_SPARC_32 && SLJIT_CONFIG_SPARC_32)
+- if (op == SLJIT_LUDIV)
++ if ((op | 0x2) == SLJIT_UDIVI)
+ FAIL_IF(push_inst(compiler, WRY | S1(0), MOVABLE_INS));
+ else {
+ FAIL_IF(push_inst(compiler, SRA | D(TMP_REG1) | S1(SLJIT_R0) | IMM(31), DR(TMP_REG1)));
+ FAIL_IF(push_inst(compiler, WRY | S1(TMP_REG1), MOVABLE_INS));
+ }
+- FAIL_IF(push_inst(compiler, OR | D(TMP_REG2) | S1(0) | S2(SLJIT_R0), DR(TMP_REG2)));
+- FAIL_IF(push_inst(compiler, (op == SLJIT_LUDIV ? UDIV : SDIV) | D(SLJIT_R0) | S1(SLJIT_R0) | S2(SLJIT_R1), DR(SLJIT_R0)));
++ if (op <= SLJIT_SDIVMOD)
++ FAIL_IF(push_inst(compiler, OR | D(TMP_REG2) | S1(0) | S2(SLJIT_R0), DR(TMP_REG2)));
++ FAIL_IF(push_inst(compiler, ((op | 0x2) == SLJIT_UDIVI ? UDIV : SDIV) | D(SLJIT_R0) | S1(SLJIT_R0) | S2(SLJIT_R1), DR(SLJIT_R0)));
++ if (op >= SLJIT_UDIVI)
++ return SLJIT_SUCCESS;
+ FAIL_IF(push_inst(compiler, SMUL | D(SLJIT_R1) | S1(SLJIT_R0) | S2(SLJIT_R1), DR(SLJIT_R1)));
+- FAIL_IF(push_inst(compiler, SUB | D(SLJIT_R1) | S1(TMP_REG2) | S2(SLJIT_R1), DR(SLJIT_R1)));
+- return SLJIT_SUCCESS;
++ return push_inst(compiler, SUB | D(SLJIT_R1) | S1(TMP_REG2) | S2(SLJIT_R1), DR(SLJIT_R1));
+ #else
+ #error "Implementation required"
+ #endif
+diff --git a/ext/pcre/pcrelib/sljit/sljitNativeTILEGX_64.c b/ext/pcre/pcrelib/sljit/sljitNativeTILEGX_64.c
+index 1d6aa5a..4d40392f 100644
+--- a/ext/pcre/pcrelib/sljit/sljitNativeTILEGX_64.c
++++ b/ext/pcre/pcrelib/sljit/sljitNativeTILEGX_64.c
+@@ -35,21 +35,21 @@
+ #define SIMM_16BIT_MIN (-0x8000)
+ #define SIMM_17BIT_MAX (0xffff)
+ #define SIMM_17BIT_MIN (-0x10000)
+-#define SIMM_32BIT_MIN (-0x80000000)
+ #define SIMM_32BIT_MAX (0x7fffffff)
+-#define SIMM_48BIT_MIN (0x800000000000L)
++#define SIMM_32BIT_MIN (-0x7fffffff - 1)
+ #define SIMM_48BIT_MAX (0x7fffffff0000L)
++#define SIMM_48BIT_MIN (-0x800000000000L)
+ #define IMM16(imm) ((imm) & 0xffff)
+
+ #define UIMM_16BIT_MAX (0xffff)
+
+-#define TMP_REG1 (SLJIT_NO_REGISTERS + 1)
+-#define TMP_REG2 (SLJIT_NO_REGISTERS + 2)
+-#define TMP_REG3 (SLJIT_NO_REGISTERS + 3)
+-#define ADDR_TMP (SLJIT_NO_REGISTERS + 4)
++#define TMP_REG1 (SLJIT_NUMBER_OF_REGISTERS + 2)
++#define TMP_REG2 (SLJIT_NUMBER_OF_REGISTERS + 3)
++#define TMP_REG3 (SLJIT_NUMBER_OF_REGISTERS + 4)
++#define ADDR_TMP (SLJIT_NUMBER_OF_REGISTERS + 5)
+ #define PIC_ADDR_REG TMP_REG2
+
+-static SLJIT_CONST sljit_ub reg_map[SLJIT_NO_REGISTERS + 5] = {
++static SLJIT_CONST sljit_ub reg_map[SLJIT_NUMBER_OF_REGISTERS + 6] = {
+ 63, 0, 1, 2, 3, 4, 30, 31, 32, 33, 34, 54, 5, 16, 6, 7
+ };
+
+@@ -58,11 +58,6 @@ static SLJIT_CONST sljit_ub reg_map[SLJIT_NO_REGISTERS + 5] = {
+ #define TMP_REG2_mapped 16
+ #define TMP_REG3_mapped 6
+ #define ADDR_TMP_mapped 7
+-#define SLJIT_SAVED_REG1_mapped 30
+-#define SLJIT_SAVED_REG2_mapped 31
+-#define SLJIT_SAVED_REG3_mapped 32
+-#define SLJIT_SAVED_EREG1_mapped 33
+-#define SLJIT_SAVED_EREG2_mapped 34
+
+ /* Flags are keept in volatile registers. */
+ #define EQUAL_FLAG 8
+@@ -399,6 +394,9 @@ static sljit_si push_inst(struct sljit_compiler *compiler, sljit_ins ins)
+ #define SUB(dst, srca, srcb) \
+ push_3_buffer(compiler, TILEGX_OPC_SUB, dst, srca, srcb, __LINE__)
+
++#define MUL(dst, srca, srcb) \
++ push_3_buffer(compiler, TILEGX_OPC_MULX, dst, srca, srcb, __LINE__)
++
+ #define NOR(dst, srca, srcb) \
+ push_3_buffer(compiler, TILEGX_OPC_NOR, dst, srca, srcb, __LINE__)
+
+@@ -547,8 +545,8 @@ const struct Format* compute_format()
+
+ const struct Format* match = NULL;
+ const struct Format *b = NULL;
+- unsigned int i = 0;
+- for (i; i < sizeof formats / sizeof formats[0]; i++) {
++ unsigned int i;
++ for (i = 0; i < sizeof formats / sizeof formats[0]; i++) {
+ b = &formats[i];
+ if ((b->pipe_mask & compatible_pipes) == b->pipe_mask) {
+ match = b;
+@@ -625,7 +623,6 @@ tilegx_bundle_bits get_bundle_bit(struct jit_instr *inst)
+
+ static sljit_si update_buffer(struct sljit_compiler *compiler)
+ {
+- int count;
+ int i;
+ int orig_index = inst_buf_index;
+ struct jit_instr inst0 = inst_buf[0];
+@@ -738,8 +735,10 @@ static sljit_si update_buffer(struct sljit_compiler *compiler)
+
+ static sljit_si flush_buffer(struct sljit_compiler *compiler)
+ {
+- while (inst_buf_index != 0)
+- update_buffer(compiler);
++ while (inst_buf_index != 0) {
++ FAIL_IF(update_buffer(compiler));
++ }
++ return SLJIT_SUCCESS;
+ }
+
+ static sljit_si push_4_buffer(struct sljit_compiler *compiler, tilegx_mnemonic opc, int op0, int op1, int op2, int op3, int line)
+@@ -787,6 +786,7 @@ static sljit_si push_3_buffer(struct sljit_compiler *compiler, tilegx_mnemonic o
+ case TILEGX_OPC_ADD:
+ case TILEGX_OPC_AND:
+ case TILEGX_OPC_SUB:
++ case TILEGX_OPC_MULX:
+ case TILEGX_OPC_OR:
+ case TILEGX_OPC_XOR:
+ case TILEGX_OPC_NOR:
+@@ -905,7 +905,6 @@ static SLJIT_INLINE sljit_ins * detect_jump_type(struct sljit_jump *jump, sljit_
+ sljit_sw diff;
+ sljit_uw target_addr;
+ sljit_ins *inst;
+- sljit_ins saved_inst;
+
+ if (jump->flags & SLJIT_REWRITABLE_JUMP)
+ return code_ptr;
+@@ -1009,7 +1008,7 @@ SLJIT_API_FUNC_ATTRIBUTE void * sljit_generate_code(struct sljit_compiler *compi
+ struct sljit_const *const_;
+
+ CHECK_ERROR_PTR();
+- check_sljit_generate_code(compiler);
++ CHECK_PTR(check_sljit_generate_code(compiler));
+ reverse_buf(compiler);
+
+ code = (sljit_ins *)SLJIT_MALLOC_EXEC(compiler->size * sizeof(sljit_ins));
+@@ -1178,13 +1177,13 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_enter(struct sljit_compiler *compil
+ sljit_si fscratches, sljit_si fsaveds, sljit_si local_size)
+ {
+ sljit_ins base;
+- sljit_ins bundle = 0;
+-
++ sljit_si i, tmp;
++
+ CHECK_ERROR();
+- check_sljit_emit_enter(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size);
++ CHECK(check_sljit_emit_enter(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size));
+ set_emit_enter(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size);
+
+- local_size += (saveds + 1) * sizeof(sljit_sw);
++ local_size += GET_SAVED_REGISTERS_SIZE(scratches, saveds, 1);
+ local_size = (local_size + 7) & ~7;
+ compiler->local_size = local_size;
+
+@@ -1200,56 +1199,52 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_enter(struct sljit_compiler *compil
+ local_size = 0;
+ }
+
++ /* Save the return address. */
+ FAIL_IF(ADDLI(ADDR_TMP_mapped, base, local_size - 8));
+ FAIL_IF(ST_ADD(ADDR_TMP_mapped, RA, -8));
+
+- if (saveds >= 1)
+- FAIL_IF(ST_ADD(ADDR_TMP_mapped, SLJIT_SAVED_REG1_mapped, -8));
+-
+- if (saveds >= 2)
+- FAIL_IF(ST_ADD(ADDR_TMP_mapped, SLJIT_SAVED_REG2_mapped, -8));
+-
+- if (saveds >= 3)
+- FAIL_IF(ST_ADD(ADDR_TMP_mapped, SLJIT_SAVED_REG3_mapped, -8));
+-
+- if (saveds >= 4)
+- FAIL_IF(ST_ADD(ADDR_TMP_mapped, SLJIT_SAVED_EREG1_mapped, -8));
+-
+- if (saveds >= 5)
+- FAIL_IF(ST_ADD(ADDR_TMP_mapped, SLJIT_SAVED_EREG2_mapped, -8));
+-
+- if (args >= 1)
+- FAIL_IF(ADD(SLJIT_SAVED_REG1_mapped, 0, ZERO));
++ /* Save the S registers. */
++ tmp = saveds < SLJIT_NUMBER_OF_SAVED_REGISTERS ? (SLJIT_S0 + 1 - saveds) : SLJIT_FIRST_SAVED_REG;
++ for (i = SLJIT_S0; i >= tmp; i--) {
++ FAIL_IF(ST_ADD(ADDR_TMP_mapped, reg_map[i], -8));
++ }
+
+- if (args >= 2)
+- FAIL_IF(ADD(SLJIT_SAVED_REG2_mapped, 1, ZERO));
++ /* Save the R registers that need to be reserved. */
++ for (i = scratches; i >= SLJIT_FIRST_SAVED_REG; i--) {
++ FAIL_IF(ST_ADD(ADDR_TMP_mapped, reg_map[i], -8));
++ }
+
+- if (args >= 3)
+- FAIL_IF(ADD(SLJIT_SAVED_REG3_mapped, 2, ZERO));
++ /* Move the arguments to S registers. */
++ for (i = 0; i < args; i++) {
++ FAIL_IF(ADD(reg_map[SLJIT_S0 - i], i, ZERO));
++ }
+
+ return SLJIT_SUCCESS;
+ }
+
+-SLJIT_API_FUNC_ATTRIBUTE void sljit_set_context(struct sljit_compiler *compiler,
++SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_set_context(struct sljit_compiler *compiler,
+ sljit_si options, sljit_si args, sljit_si scratches, sljit_si saveds,
+ sljit_si fscratches, sljit_si fsaveds, sljit_si local_size)
+ {
+- CHECK_ERROR_VOID();
+- check_sljit_set_context(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size);
++ CHECK_ERROR();
++ CHECK(check_sljit_set_context(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size));
+ set_set_context(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size);
+
+- local_size += (saveds + 1) * sizeof(sljit_sw);
++ local_size += GET_SAVED_REGISTERS_SIZE(scratches, saveds, 1);
+ compiler->local_size = (local_size + 7) & ~7;
++
++ return SLJIT_SUCCESS;
+ }
+
+ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_return(struct sljit_compiler *compiler, sljit_si op, sljit_si src, sljit_sw srcw)
+ {
+ sljit_si local_size;
+ sljit_ins base;
+- int addr_initialized = 0;
++ sljit_si i, tmp;
++ sljit_si saveds;
+
+ CHECK_ERROR();
+- check_sljit_emit_return(compiler, op, src, srcw);
++ CHECK(check_sljit_emit_return(compiler, op, src, srcw));
+
+ FAIL_IF(emit_mov_before_return(compiler, op, src, srcw));
+
+@@ -1263,50 +1258,20 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_return(struct sljit_compiler *compi
+ local_size = 0;
+ }
+
++ /* Restore the return address. */
+ FAIL_IF(ADDLI(ADDR_TMP_mapped, base, local_size - 8));
+- FAIL_IF(LD(RA, ADDR_TMP_mapped));
+-
+- if (compiler->saveds >= 5) {
+- FAIL_IF(ADDLI(ADDR_TMP_mapped, base, local_size - 48));
+- addr_initialized = 1;
++ FAIL_IF(LD_ADD(RA, ADDR_TMP_mapped, -8));
+
+- FAIL_IF(LD_ADD(SLJIT_SAVED_EREG2_mapped, ADDR_TMP_mapped, 8));
++ /* Restore the S registers. */
++ saveds = compiler->saveds;
++ tmp = saveds < SLJIT_NUMBER_OF_SAVED_REGISTERS ? (SLJIT_S0 + 1 - saveds) : SLJIT_FIRST_SAVED_REG;
++ for (i = SLJIT_S0; i >= tmp; i--) {
++ FAIL_IF(LD_ADD(reg_map[i], ADDR_TMP_mapped, -8));
+ }
+
+- if (compiler->saveds >= 4) {
+- if (addr_initialized == 0) {
+- FAIL_IF(ADDLI(ADDR_TMP_mapped, base, local_size - 40));
+- addr_initialized = 1;
+- }
+-
+- FAIL_IF(LD_ADD(SLJIT_SAVED_EREG1_mapped, ADDR_TMP_mapped, 8));
+- }
+-
+- if (compiler->saveds >= 3) {
+- if (addr_initialized == 0) {
+- FAIL_IF(ADDLI(ADDR_TMP_mapped, base, local_size - 32));
+- addr_initialized = 1;
+- }
+-
+- FAIL_IF(LD_ADD(SLJIT_SAVED_REG3_mapped, ADDR_TMP_mapped, 8));
+- }
+-
+- if (compiler->saveds >= 2) {
+- if (addr_initialized == 0) {
+- FAIL_IF(ADDLI(ADDR_TMP_mapped, base, local_size - 24));
+- addr_initialized = 1;
+- }
+-
+- FAIL_IF(LD_ADD(SLJIT_SAVED_REG2_mapped, ADDR_TMP_mapped, 8));
+- }
+-
+- if (compiler->saveds >= 1) {
+- if (addr_initialized == 0) {
+- FAIL_IF(ADDLI(ADDR_TMP_mapped, base, local_size - 16));
+- /* addr_initialized = 1; no need to initialize as it's the last one. */
+- }
+-
+- FAIL_IF(LD_ADD(SLJIT_SAVED_REG1_mapped, ADDR_TMP_mapped, 8));
++ /* Restore the R registers that need to be reserved. */
++ for (i = compiler->scratches; i >= SLJIT_FIRST_SAVED_REG; i--) {
++ FAIL_IF(LD_ADD(reg_map[i], ADDR_TMP_mapped, -8));
+ }
+
+ if (compiler->local_size <= SIMM_16BIT_MAX)
+@@ -1585,7 +1550,7 @@ static SLJIT_INLINE sljit_si emit_op_mem2(struct sljit_compiler *compiler, sljit
+ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_fast_enter(struct sljit_compiler *compiler, sljit_si dst, sljit_sw dstw)
+ {
+ CHECK_ERROR();
+- check_sljit_emit_fast_enter(compiler, dst, dstw);
++ CHECK(check_sljit_emit_fast_enter(compiler, dst, dstw));
+ ADJUST_LOCAL_OFFSET(dst, dstw);
+
+ /* For UNUSED dst. Uncommon, but possible. */
+@@ -1602,7 +1567,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_fast_enter(struct sljit_compiler *c
+ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_fast_return(struct sljit_compiler *compiler, sljit_si src, sljit_sw srcw)
+ {
+ CHECK_ERROR();
+- check_sljit_emit_fast_return(compiler, src, srcw);
++ CHECK(check_sljit_emit_fast_return(compiler, src, srcw));
+ ADJUST_LOCAL_OFFSET(src, srcw);
+
+ if (FAST_IS_REG(src))
+@@ -1636,9 +1601,11 @@ static SLJIT_INLINE sljit_si emit_single_op(struct sljit_compiler *compiler, slj
+ if (op == SLJIT_MOV_SI)
+ return BFEXTS(reg_map[dst], reg_map[src2], 0, 31);
+
+- return BFEXTU(reg_map[dst], reg_map[src2], 0, 31);
+- } else if (dst != src2)
+- SLJIT_ASSERT_STOP();
++ return BFEXTU(reg_map[dst], reg_map[src2], 0, 31);
++ } else if (dst != src2) {
++ SLJIT_ASSERT(src2 == 0);
++ return ADD(reg_map[dst], reg_map[src2], ZERO);
++ }
+
+ return SLJIT_SUCCESS;
+
+@@ -1650,8 +1617,10 @@ static SLJIT_INLINE sljit_si emit_single_op(struct sljit_compiler *compiler, slj
+ return BFEXTS(reg_map[dst], reg_map[src2], 0, 7);
+
+ return BFEXTU(reg_map[dst], reg_map[src2], 0, 7);
+- } else if (dst != src2)
+- SLJIT_ASSERT_STOP();
++ } else if (dst != src2) {
++ SLJIT_ASSERT(src2 == 0);
++ return ADD(reg_map[dst], reg_map[src2], ZERO);
++ }
+
+ return SLJIT_SUCCESS;
+
+@@ -1663,8 +1632,10 @@ static SLJIT_INLINE sljit_si emit_single_op(struct sljit_compiler *compiler, slj
+ return BFEXTS(reg_map[dst], reg_map[src2], 0, 15);
+
+ return BFEXTU(reg_map[dst], reg_map[src2], 0, 15);
+- } else if (dst != src2)
+- SLJIT_ASSERT_STOP();
++ } else if (dst != src2) {
++ SLJIT_ASSERT(src2 == 0);
++ return ADD(reg_map[dst], reg_map[src2], ZERO);
++ }
+
+ return SLJIT_SUCCESS;
+
+@@ -1811,7 +1782,6 @@ static SLJIT_INLINE sljit_si emit_single_op(struct sljit_compiler *compiler, slj
+ else {
+ /* Rare ocasion. */
+ FAIL_IF(ADD(TMP_EREG2, reg_map[src1], ZERO));
+-
+ overflow_ra = TMP_EREG2;
+ }
+ }
+@@ -1903,6 +1873,17 @@ static SLJIT_INLINE sljit_si emit_single_op(struct sljit_compiler *compiler, slj
+
+ return SLJIT_SUCCESS;
+
++ case SLJIT_MUL:
++ if (flags & SRC2_IMM) {
++ FAIL_IF(load_immediate(compiler, TMP_REG2_mapped, src2));
++ src2 = TMP_REG2;
++ flags &= ~SRC2_IMM;
++ }
++
++ FAIL_IF(MUL(reg_map[dst], reg_map[src1], reg_map[src2]));
++
++ return SLJIT_SUCCESS;
++
+ #define EMIT_LOGICAL(op_imm, op_norm) \
+ if (flags & SRC2_IMM) { \
+ FAIL_IF(load_immediate(compiler, ADDR_TMP_mapped, src2)); \
+@@ -1950,8 +1931,8 @@ static SLJIT_INLINE sljit_si emit_single_op(struct sljit_compiler *compiler, slj
+ } else { \
+ if (op & SLJIT_SET_E) \
+ FAIL_IF(push_3_buffer( \
+- compiler, op_imm, reg_map[dst], reg_map[src1], \
+- src2 & 0x3F, __LINE__)); \
++ compiler, op_norm, EQUAL_FLAG, reg_map[src1], \
++ reg_map[src2], __LINE__)); \
+ if (CHECK_FLAGS(SLJIT_SET_E)) \
+ FAIL_IF(push_3_buffer( \
+ compiler, op_norm, reg_map[dst], reg_map[src1], \
+@@ -2105,66 +2086,61 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op_flags(struct sljit_compiler *com
+ {
+ sljit_si sugg_dst_ar, dst_ar;
+ sljit_si flags = GET_ALL_FLAGS(op);
++ sljit_si mem_type = (op & SLJIT_INT_OP) ? (INT_DATA | SIGNED_DATA) : WORD_DATA;
+
+ CHECK_ERROR();
+- check_sljit_emit_op_flags(compiler, op, dst, dstw, src, srcw, type);
++ CHECK(check_sljit_emit_op_flags(compiler, op, dst, dstw, src, srcw, type));
+ ADJUST_LOCAL_OFFSET(dst, dstw);
+
+ if (dst == SLJIT_UNUSED)
+ return SLJIT_SUCCESS;
+
+ op = GET_OPCODE(op);
++ if (op == SLJIT_MOV_SI || op == SLJIT_MOV_UI)
++ mem_type = INT_DATA | SIGNED_DATA;
+ sugg_dst_ar = reg_map[(op < SLJIT_ADD && FAST_IS_REG(dst)) ? dst : TMP_REG2];
+
+ compiler->cache_arg = 0;
+ compiler->cache_argw = 0;
+ if (op >= SLJIT_ADD && (src & SLJIT_MEM)) {
+ ADJUST_LOCAL_OFFSET(src, srcw);
+- FAIL_IF(emit_op_mem2(compiler, WORD_DATA | LOAD_DATA, TMP_REG1_mapped, src, srcw, dst, dstw));
++ FAIL_IF(emit_op_mem2(compiler, mem_type | LOAD_DATA, TMP_REG1_mapped, src, srcw, dst, dstw));
+ src = TMP_REG1;
+ srcw = 0;
+ }
+
+- switch (type) {
+- case SLJIT_C_EQUAL:
+- case SLJIT_C_NOT_EQUAL:
++ switch (type & 0xff) {
++ case SLJIT_EQUAL:
++ case SLJIT_NOT_EQUAL:
+ FAIL_IF(CMPLTUI(sugg_dst_ar, EQUAL_FLAG, 1));
+ dst_ar = sugg_dst_ar;
+ break;
+- case SLJIT_C_LESS:
+- case SLJIT_C_GREATER_EQUAL:
+- case SLJIT_C_FLOAT_LESS:
+- case SLJIT_C_FLOAT_GREATER_EQUAL:
++ case SLJIT_LESS:
++ case SLJIT_GREATER_EQUAL:
+ dst_ar = ULESS_FLAG;
+ break;
+- case SLJIT_C_GREATER:
+- case SLJIT_C_LESS_EQUAL:
+- case SLJIT_C_FLOAT_GREATER:
+- case SLJIT_C_FLOAT_LESS_EQUAL:
++ case SLJIT_GREATER:
++ case SLJIT_LESS_EQUAL:
+ dst_ar = UGREATER_FLAG;
+ break;
+- case SLJIT_C_SIG_LESS:
+- case SLJIT_C_SIG_GREATER_EQUAL:
++ case SLJIT_SIG_LESS:
++ case SLJIT_SIG_GREATER_EQUAL:
+ dst_ar = LESS_FLAG;
+ break;
+- case SLJIT_C_SIG_GREATER:
+- case SLJIT_C_SIG_LESS_EQUAL:
++ case SLJIT_SIG_GREATER:
++ case SLJIT_SIG_LESS_EQUAL:
+ dst_ar = GREATER_FLAG;
+ break;
+- case SLJIT_C_OVERFLOW:
+- case SLJIT_C_NOT_OVERFLOW:
++ case SLJIT_OVERFLOW:
++ case SLJIT_NOT_OVERFLOW:
+ dst_ar = OVERFLOW_FLAG;
+ break;
+- case SLJIT_C_MUL_OVERFLOW:
+- case SLJIT_C_MUL_NOT_OVERFLOW:
++ case SLJIT_MUL_OVERFLOW:
++ case SLJIT_MUL_NOT_OVERFLOW:
+ FAIL_IF(CMPLTUI(sugg_dst_ar, OVERFLOW_FLAG, 1));
+ dst_ar = sugg_dst_ar;
+ type ^= 0x1; /* Flip type bit for the XORI below. */
+ break;
+- case SLJIT_C_FLOAT_EQUAL:
+- case SLJIT_C_FLOAT_NOT_EQUAL:
+- dst_ar = EQUAL_FLAG;
+- break;
+
+ default:
+ SLJIT_ASSERT_STOP();
+@@ -2180,11 +2156,11 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op_flags(struct sljit_compiler *com
+ if (op >= SLJIT_ADD) {
+ if (TMP_REG2_mapped != dst_ar)
+ FAIL_IF(ADD(TMP_REG2_mapped, dst_ar, ZERO));
+- return emit_op(compiler, op | flags, CUMULATIVE_OP | LOGICAL_OP | IMM_OP | ALT_KEEP_CACHE, dst, dstw, src, srcw, TMP_REG2, 0);
++ return emit_op(compiler, op | flags, mem_type | CUMULATIVE_OP | LOGICAL_OP | IMM_OP | ALT_KEEP_CACHE, dst, dstw, src, srcw, TMP_REG2, 0);
+ }
+
+ if (dst & SLJIT_MEM)
+- return emit_op_mem(compiler, WORD_DATA, dst_ar, dst, dstw);
++ return emit_op_mem(compiler, mem_type, dst_ar, dst, dstw);
+
+ if (sugg_dst_ar != dst_ar)
+ return ADD(sugg_dst_ar, dst_ar, ZERO);
+@@ -2194,7 +2170,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op_flags(struct sljit_compiler *com
+
+ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler, sljit_si op) {
+ CHECK_ERROR();
+- check_sljit_emit_op0(compiler, op);
++ CHECK(check_sljit_emit_op0(compiler, op));
+
+ op = GET_OPCODE(op);
+ switch (op) {
+@@ -2204,10 +2180,10 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler
+ case SLJIT_BREAKPOINT:
+ return PI(BPT);
+
+- case SLJIT_UMUL:
+- case SLJIT_SMUL:
+- case SLJIT_UDIV:
+- case SLJIT_SDIV:
++ case SLJIT_LUMUL:
++ case SLJIT_LSMUL:
++ case SLJIT_UDIVI:
++ case SLJIT_SDIVI:
+ SLJIT_ASSERT_STOP();
+ }
+
+@@ -2217,7 +2193,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler
+ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op1(struct sljit_compiler *compiler, sljit_si op, sljit_si dst, sljit_sw dstw, sljit_si src, sljit_sw srcw)
+ {
+ CHECK_ERROR();
+- check_sljit_emit_op1(compiler, op, dst, dstw, src, srcw);
++ CHECK(check_sljit_emit_op1(compiler, op, dst, dstw, src, srcw));
+ ADJUST_LOCAL_OFFSET(dst, dstw);
+ ADJUST_LOCAL_OFFSET(src, srcw);
+
+@@ -2273,7 +2249,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op1(struct sljit_compiler *compiler
+ return emit_op(compiler, SLJIT_SUB | GET_ALL_FLAGS(op), IMM_OP, dst, dstw, SLJIT_IMM, 0, src, srcw);
+
+ case SLJIT_CLZ:
+- return emit_op(compiler, op, 0, dst, dstw, TMP_REG1, 0, src, srcw);
++ return emit_op(compiler, op, (op & SLJIT_INT_OP) ? INT_DATA : WORD_DATA, dst, dstw, TMP_REG1, 0, src, srcw);
+ }
+
+ return SLJIT_SUCCESS;
+@@ -2282,7 +2258,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op1(struct sljit_compiler *compiler
+ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op2(struct sljit_compiler *compiler, sljit_si op, sljit_si dst, sljit_sw dstw, sljit_si src1, sljit_sw src1w, sljit_si src2, sljit_sw src2w)
+ {
+ CHECK_ERROR();
+- check_sljit_emit_op2(compiler, op, dst, dstw, src1, src1w, src2, src2w);
++ CHECK(check_sljit_emit_op2(compiler, op, dst, dstw, src1, src1w, src2, src2w));
+ ADJUST_LOCAL_OFFSET(dst, dstw);
+ ADJUST_LOCAL_OFFSET(src1, src1w);
+ ADJUST_LOCAL_OFFSET(src2, src2w);
+@@ -2325,7 +2301,7 @@ SLJIT_API_FUNC_ATTRIBUTE struct sljit_label * sljit_emit_label(struct sljit_comp
+ flush_buffer(compiler);
+
+ CHECK_ERROR_PTR();
+- check_sljit_emit_label(compiler);
++ CHECK_PTR(check_sljit_emit_label(compiler));
+
+ if (compiler->last_label && compiler->last_label->size == compiler->size)
+ return compiler->last_label;
+@@ -2344,7 +2320,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_ijump(struct sljit_compiler *compil
+ flush_buffer(compiler);
+
+ CHECK_ERROR();
+- check_sljit_emit_ijump(compiler, type, src, srcw);
++ CHECK(check_sljit_emit_ijump(compiler, type, src, srcw));
+ ADJUST_LOCAL_OFFSET(src, srcw);
+
+ if (FAST_IS_REG(src)) {
+@@ -2404,8 +2380,10 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_ijump(struct sljit_compiler *compil
+
+ return SLJIT_SUCCESS;
+
+- } else if (src & SLJIT_MEM)
++ } else if (src & SLJIT_MEM) {
+ FAIL_IF(emit_op(compiler, SLJIT_MOV, WORD_DATA, TMP_REG2, 0, TMP_REG1, 0, src, srcw));
++ flush_buffer(compiler);
++ }
+
+ FAIL_IF(JR_SOLO(reg_map[src_r]));
+
+@@ -2432,7 +2410,7 @@ SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump * sljit_emit_jump(struct sljit_compil
+ flush_buffer(compiler);
+
+ CHECK_ERROR_PTR();
+- check_sljit_emit_jump(compiler, type);
++ CHECK_PTR(check_sljit_emit_jump(compiler, type));
+
+ jump = (struct sljit_jump *)ensure_abuf(compiler, sizeof(struct sljit_jump));
+ PTR_FAIL_IF(!jump);
+@@ -2440,48 +2418,42 @@ SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump * sljit_emit_jump(struct sljit_compil
+ type &= 0xff;
+
+ switch (type) {
+- case SLJIT_C_EQUAL:
+- case SLJIT_C_FLOAT_NOT_EQUAL:
++ case SLJIT_EQUAL:
+ BR_NZ(EQUAL_FLAG);
+ break;
+- case SLJIT_C_NOT_EQUAL:
+- case SLJIT_C_FLOAT_EQUAL:
++ case SLJIT_NOT_EQUAL:
+ BR_Z(EQUAL_FLAG);
+ break;
+- case SLJIT_C_LESS:
+- case SLJIT_C_FLOAT_LESS:
++ case SLJIT_LESS:
+ BR_Z(ULESS_FLAG);
+ break;
+- case SLJIT_C_GREATER_EQUAL:
+- case SLJIT_C_FLOAT_GREATER_EQUAL:
++ case SLJIT_GREATER_EQUAL:
+ BR_NZ(ULESS_FLAG);
+ break;
+- case SLJIT_C_GREATER:
+- case SLJIT_C_FLOAT_GREATER:
++ case SLJIT_GREATER:
+ BR_Z(UGREATER_FLAG);
+ break;
+- case SLJIT_C_LESS_EQUAL:
+- case SLJIT_C_FLOAT_LESS_EQUAL:
++ case SLJIT_LESS_EQUAL:
+ BR_NZ(UGREATER_FLAG);
+ break;
+- case SLJIT_C_SIG_LESS:
++ case SLJIT_SIG_LESS:
+ BR_Z(LESS_FLAG);
+ break;
+- case SLJIT_C_SIG_GREATER_EQUAL:
++ case SLJIT_SIG_GREATER_EQUAL:
+ BR_NZ(LESS_FLAG);
+ break;
+- case SLJIT_C_SIG_GREATER:
++ case SLJIT_SIG_GREATER:
+ BR_Z(GREATER_FLAG);
+ break;
+- case SLJIT_C_SIG_LESS_EQUAL:
++ case SLJIT_SIG_LESS_EQUAL:
+ BR_NZ(GREATER_FLAG);
+ break;
+- case SLJIT_C_OVERFLOW:
+- case SLJIT_C_MUL_OVERFLOW:
++ case SLJIT_OVERFLOW:
++ case SLJIT_MUL_OVERFLOW:
+ BR_Z(OVERFLOW_FLAG);
+ break;
+- case SLJIT_C_NOT_OVERFLOW:
+- case SLJIT_C_MUL_NOT_OVERFLOW:
++ case SLJIT_NOT_OVERFLOW:
++ case SLJIT_MUL_NOT_OVERFLOW:
+ BR_NZ(OVERFLOW_FLAG);
+ break;
+ default:
+@@ -2536,7 +2508,7 @@ SLJIT_API_FUNC_ATTRIBUTE struct sljit_const * sljit_emit_const(struct sljit_comp
+ flush_buffer(compiler);
+
+ CHECK_ERROR_PTR();
+- check_sljit_emit_const(compiler, dst, dstw, init_value);
++ CHECK_PTR(check_sljit_emit_const(compiler, dst, dstw, init_value));
+ ADJUST_LOCAL_OFFSET(dst, dstw);
+
+ const_ = (struct sljit_const *)ensure_abuf(compiler, sizeof(struct sljit_const));
+@@ -2572,3 +2544,18 @@ SLJIT_API_FUNC_ATTRIBUTE void sljit_set_const(sljit_uw addr, sljit_sw new_consta
+ inst[3] = (inst[3] & ~(0xFFFFL << 43)) | ((new_constant & 0xFFFFL) << 43);
+ SLJIT_CACHE_FLUSH(inst, inst + 4);
+ }
++
++SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_get_register_index(sljit_si reg)
++{ /* Returns the reg_map[] entry for the given register id. */
++ CHECK_REG_INDEX(check_sljit_get_register_index(reg)); /* NOTE(review): macro defined elsewhere; presumably validates reg and may return early - confirm. */
++ return reg_map[reg]; /* reg is used directly as the table index. */
++}
++
++SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op_custom(struct sljit_compiler *compiler,
++ void *instruction, sljit_si size)
++{ /* Stub: nothing is emitted here; the function reports SLJIT_ERR_UNSUPPORTED. */
++ CHECK_ERROR(); /* NOTE(review): macro defined elsewhere; presumably returns early when the compiler is already in an error state - confirm. */
++ CHECK(check_sljit_emit_op_custom(compiler, instruction, size)); /* instruction and size are only passed to this check. */
++ return SLJIT_ERR_UNSUPPORTED;
++}
++
+diff --git a/ext/pcre/pcrelib/sljit/sljitNativeX86_common.c b/ext/pcre/pcrelib/sljit/sljitNativeX86_common.c
+index 22a163f..416c15a 100644
+--- a/ext/pcre/pcrelib/sljit/sljitNativeX86_common.c
++++ b/ext/pcre/pcrelib/sljit/sljitNativeX86_common.c
+@@ -273,7 +273,9 @@ static sljit_si cpu_has_sse2 = -1;
+ #endif
+ static sljit_si cpu_has_cmov = -1;
+
+-#if defined(_MSC_VER) && _MSC_VER >= 1400
++#ifdef _WIN32_WCE
++#include <cmnintrin.h>
++#elif defined(_MSC_VER) && _MSC_VER >= 1400
+ #include <intrin.h>
+ #endif
+
+@@ -742,8 +744,10 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler
+ break;
+ case SLJIT_LUMUL:
+ case SLJIT_LSMUL:
+- case SLJIT_LUDIV:
+- case SLJIT_LSDIV:
++ case SLJIT_UDIVMOD:
++ case SLJIT_SDIVMOD:
++ case SLJIT_UDIVI:
++ case SLJIT_SDIVI:
+ compiler->flags_saved = 0;
+ #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+ #ifdef _WIN64
+@@ -761,9 +765,10 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler
+ #endif
+ compiler->mode32 = op & SLJIT_INT_OP;
+ #endif
++ SLJIT_COMPILE_ASSERT((SLJIT_UDIVMOD & 0x2) == 0 && SLJIT_UDIVI - 0x2 == SLJIT_UDIVMOD, bad_div_opcode_assignments);
+
+ op = GET_OPCODE(op);
+- if (op == SLJIT_LUDIV) {
++ if ((op | 0x2) == SLJIT_UDIVI) {
+ #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) || defined(_WIN64)
+ EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_R1, 0);
+ inst = emit_x86_instruction(compiler, 1, SLJIT_R1, 0, SLJIT_R1, 0);
+@@ -774,7 +779,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler
+ *inst = XOR_r_rm;
+ }
+
+- if (op == SLJIT_LSDIV) {
++ if ((op | 0x2) == SLJIT_SDIVI) {
+ #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) || defined(_WIN64)
+ EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_R1, 0);
+ #endif
+@@ -805,10 +810,10 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler
+ FAIL_IF(!inst);
+ INC_SIZE(2);
+ *inst++ = GROUP_F7;
+- *inst = MOD_REG | ((op >= SLJIT_LUDIV) ? reg_map[TMP_REG1] : reg_map[SLJIT_R1]);
++ *inst = MOD_REG | ((op >= SLJIT_UDIVMOD) ? reg_map[TMP_REG1] : reg_map[SLJIT_R1]);
+ #else
+ #ifdef _WIN64
+- size = (!compiler->mode32 || op >= SLJIT_LUDIV) ? 3 : 2;
++ size = (!compiler->mode32 || op >= SLJIT_UDIVMOD) ? 3 : 2;
+ #else
+ size = (!compiler->mode32) ? 3 : 2;
+ #endif
+@@ -817,11 +822,11 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler
+ INC_SIZE(size);
+ #ifdef _WIN64
+ if (!compiler->mode32)
+- *inst++ = REX_W | ((op >= SLJIT_LUDIV) ? REX_B : 0);
+- else if (op >= SLJIT_LUDIV)
++ *inst++ = REX_W | ((op >= SLJIT_UDIVMOD) ? REX_B : 0);
++ else if (op >= SLJIT_UDIVMOD)
+ *inst++ = REX_B;
+ *inst++ = GROUP_F7;
+- *inst = MOD_REG | ((op >= SLJIT_LUDIV) ? reg_lmap[TMP_REG1] : reg_lmap[SLJIT_R1]);
++ *inst = MOD_REG | ((op >= SLJIT_UDIVMOD) ? reg_lmap[TMP_REG1] : reg_lmap[SLJIT_R1]);
+ #else
+ if (!compiler->mode32)
+ *inst++ = REX_W;
+@@ -836,15 +841,21 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler
+ case SLJIT_LSMUL:
+ *inst |= IMUL;
+ break;
+- case SLJIT_LUDIV:
++ case SLJIT_UDIVMOD:
++ case SLJIT_UDIVI:
+ *inst |= DIV;
+ break;
+- case SLJIT_LSDIV:
++ case SLJIT_SDIVMOD:
++ case SLJIT_SDIVI:
+ *inst |= IDIV;
+ break;
+ }
+ #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) && !defined(_WIN64)
+- EMIT_MOV(compiler, SLJIT_R1, 0, TMP_REG1, 0);
++ if (op <= SLJIT_SDIVMOD)
++ EMIT_MOV(compiler, SLJIT_R1, 0, TMP_REG1, 0);
++#else
++ if (op >= SLJIT_UDIVI)
++ EMIT_MOV(compiler, SLJIT_R1, 0, TMP_REG1, 0);
+ #endif
+ break;
+ }
+@@ -1905,60 +1916,62 @@ static sljit_si emit_test_binary(struct sljit_compiler *compiler,
+ return SLJIT_SUCCESS;
+ }
+
+- if (FAST_IS_REG(src1)) {
++ if (!(src1 & SLJIT_IMM)) {
+ if (src2 & SLJIT_IMM) {
+ #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+ if (IS_HALFWORD(src2w) || compiler->mode32) {
+- inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, 0);
++ inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, src1w);
+ FAIL_IF(!inst);
+ *inst = GROUP_F7;
+ }
+ else {
+ FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src2w));
+- inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, src1, 0);
++ inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, src1, src1w);
+ FAIL_IF(!inst);
+ *inst = TEST_rm_r;
+ }
+ #else
+- inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, 0);
++ inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, src1w);
+ FAIL_IF(!inst);
+ *inst = GROUP_F7;
+ #endif
++ return SLJIT_SUCCESS;
+ }
+- else {
++ else if (FAST_IS_REG(src1)) {
+ inst = emit_x86_instruction(compiler, 1, src1, 0, src2, src2w);
+ FAIL_IF(!inst);
+ *inst = TEST_rm_r;
++ return SLJIT_SUCCESS;
+ }
+- return SLJIT_SUCCESS;
+ }
+
+- if (FAST_IS_REG(src2)) {
++ if (!(src2 & SLJIT_IMM)) {
+ if (src1 & SLJIT_IMM) {
+ #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+ if (IS_HALFWORD(src1w) || compiler->mode32) {
+- inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src1w, src2, 0);
++ inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src1w, src2, src2w);
+ FAIL_IF(!inst);
+ *inst = GROUP_F7;
+ }
+ else {
+ FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src1w));
+- inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, src2, 0);
++ inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, src2, src2w);
+ FAIL_IF(!inst);
+ *inst = TEST_rm_r;
+ }
+ #else
+- inst = emit_x86_instruction(compiler, 1, src1, src1w, src2, 0);
++ inst = emit_x86_instruction(compiler, 1, src1, src1w, src2, src2w);
+ FAIL_IF(!inst);
+ *inst = GROUP_F7;
+ #endif
++ return SLJIT_SUCCESS;
+ }
+- else {
++ else if (FAST_IS_REG(src2)) {
+ inst = emit_x86_instruction(compiler, 1, src2, 0, src1, src1w);
+ FAIL_IF(!inst);
+ *inst = TEST_rm_r;
++ return SLJIT_SUCCESS;
+ }
+- return SLJIT_SUCCESS;
+ }
+
+ EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
+@@ -2923,3 +2936,69 @@ SLJIT_API_FUNC_ATTRIBUTE void sljit_set_const(sljit_uw addr, sljit_sw new_consta
+ {
+ *(sljit_sw*)addr = new_constant;
+ }
++
++SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_x86_is_sse2_available(void)
++{ /* Returns cpu_has_sse2 when SLJIT_DETECT_SSE2 is enabled, otherwise the constant 1. */
++#if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
++ if (cpu_has_sse2 == -1) /* -1 is the value the static is initialised with, i.e. not probed yet. */
++ get_cpu_features(); /* NOTE(review): defined elsewhere; presumably fills in cpu_has_sse2 - confirm. */
++ return cpu_has_sse2;
++#else
++ return 1; /* Detection is compiled out, so availability is reported unconditionally. */
++#endif
++}
++
++SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_x86_is_cmov_available(void)
++{ /* Returns cpu_has_cmov, calling get_cpu_features() first if it still holds -1. */
++ if (cpu_has_cmov == -1) /* -1 is the value the static is initialised with, i.e. not probed yet. */
++ get_cpu_features(); /* NOTE(review): defined elsewhere; presumably fills in cpu_has_cmov - confirm. */
++ return cpu_has_cmov;
++}
++
++SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_x86_emit_cmov(struct sljit_compiler *compiler,
++ sljit_si type,
++ sljit_si dst_reg,
++ sljit_si src, sljit_sw srcw)
++{ /* Emits a two-byte GROUP_0F instruction with dst_reg as destination and src/srcw as source; returns SLJIT_SUCCESS once the bytes are written. */
++ sljit_ub* inst; /* Write cursor into the buffer returned by emit_x86_instruction(). */
++
++ CHECK_ERROR();
++#if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
++ CHECK_ARGUMENT(sljit_x86_is_cmov_available()); /* Caller must have cmov support reported as available. */
++ CHECK_ARGUMENT(!(type & ~(0xff | SLJIT_INT_OP))); /* Only the low byte and SLJIT_INT_OP may be set in type. */
++ CHECK_ARGUMENT((type & 0xff) >= SLJIT_EQUAL && (type & 0xff) <= SLJIT_D_ORDERED); /* Condition code must lie in [SLJIT_EQUAL, SLJIT_D_ORDERED]. */
++ CHECK_ARGUMENT(FUNCTION_CHECK_IS_REG(dst_reg & ~SLJIT_INT_OP)); /* dst_reg, without the SLJIT_INT_OP flag, must be a register. */
++ FUNCTION_CHECK_SRC(src, srcw);
++#endif
++#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
++ if (SLJIT_UNLIKELY(!!compiler->verbose)) { /* Trace the operation to the verbose stream when one is set. */
++ fprintf(compiler->verbose, " x86_cmov%s %s%s, ",
++ !(dst_reg & SLJIT_INT_OP) ? "" : ".i",
++ JUMP_PREFIX(type), jump_names[type & 0xff]);
++ sljit_verbose_reg(compiler, dst_reg & ~SLJIT_INT_OP);
++ fprintf(compiler->verbose, ", ");
++ sljit_verbose_param(compiler, src, srcw);
++ fprintf(compiler->verbose, "\n");
++ }
++#endif
++
++ ADJUST_LOCAL_OFFSET(src, srcw);
++ CHECK_EXTRA_REGS(src, srcw, (void)0);
++
++#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
++ compiler->mode32 = dst_reg & SLJIT_INT_OP; /* The SLJIT_INT_OP flag carried in dst_reg selects mode32 on x86-64. */
++#endif
++ dst_reg &= ~SLJIT_INT_OP; /* Strip the flag so dst_reg is a plain register id from here on. */
++
++ if (SLJIT_UNLIKELY(src & SLJIT_IMM)) { /* An immediate source is first moved into TMP_REG1 and that register is used instead. */
++ EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, srcw);
++ src = TMP_REG1;
++ srcw = 0;
++ }
++
++ inst = emit_x86_instruction(compiler, 2, dst_reg, 0, src, srcw); /* Second argument 2 matches the two opcode bytes stored below. */
++ FAIL_IF(!inst);
++ *inst++ = GROUP_0F;
++ *inst = get_jump_code(type & 0xff) - 0x40; /* NOTE(review): presumably rebases the 0F 8x Jcc opcode to the matching 0F 4x CMOVcc opcode - confirm against get_jump_code(). */
++ return SLJIT_SUCCESS;
++}