https-github-com-nzysoft
diff --git a/‎Doc/library/re.rst‎
Lines changed: 77 additions & 50 deletions b/‎Doc/library/re.rst‎
Lines changed: 77 additions & 50 deletions
diff --git a/‎Lib/_strptime.py‎
Lines changed: 2 additions & 2 deletions b/‎Lib/_strptime.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎Lib/base64.py‎
Lines changed: 7 additions & 7 deletions b/‎Lib/base64.py‎
Lines changed: 7 additions & 7 deletions
diff --git a/‎Lib/decimal.py‎
Lines changed: 1 addition & 1 deletion b/‎Lib/decimal.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎Lib/distutils/cygwinccompiler.py‎
Lines changed: 3 additions & 3 deletions b/‎Lib/distutils/cygwinccompiler.py‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎Lib/distutils/emxccompiler.py‎
Lines changed: 1 addition & 1 deletion b/‎Lib/distutils/emxccompiler.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎Lib/distutils/sysconfig.py‎
Lines changed: 1 addition & 1 deletion b/‎Lib/distutils/sysconfig.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎Lib/distutils/util.py‎
Lines changed: 1 addition & 1 deletion b/‎Lib/distutils/util.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎Lib/distutils/version.py‎
Lines changed: 1 addition & 1 deletion b/‎Lib/distutils/version.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎Lib/distutils/versionpredicate.py‎
Lines changed: 4 additions & 2 deletions b/‎Lib/distutils/versionpredicate.py‎
Lines changed: 4 additions & 2 deletions
@@ -11,9 +11,13 @@
 
 
 This module provides regular expression matching operations similar to
-those found in Perl. Both patterns and strings to be searched can be
-Unicode strings as well as 8-bit strings.  The :mod:`re` module is
-always available.
+those found in Perl.  The :mod:`re` module is always available.
+
+Both patterns and strings to be searched can be Unicode strings as well as
+8-bit strings. However, Unicode strings and 8-bit strings cannot be mixed:
+that is, you cannot match a Unicode string with a byte pattern or
+vice-versa; similarly, when asking for a substitution, the replacement
+string must be of the same type as both the pattern and the search string.
 
 Regular expressions use the backslash character (``'\'``) to indicate
 special forms or to allow special characters to be used without invoking
@@ -212,12 +216,12 @@ The special characters are:
    group; ``(?P<name>...)`` is the only exception to this rule. Following are the
    currently supported extensions.
 
-``(?iLmsux)``
-   (One or more letters from the set ``'i'``, ``'L'``, ``'m'``, ``'s'``,
-   ``'u'``, ``'x'``.)  The group matches the empty string; the letters
-   set the corresponding flags: :const:`re.I` (ignore case),
-   :const:`re.L` (locale dependent), :const:`re.M` (multi-line),
-   :const:`re.S` (dot matches all), :const:`re.U` (Unicode dependent),
+``(?aiLmsux)``
+   (One or more letters from the set ``'a'``, ``'i'``, ``'L'``, ``'m'``,
+   ``'s'``, ``'u'``, ``'x'``.)  The group matches the empty string; the
+   letters set the corresponding flags: :const:`re.A` (ASCII-only matching),
+   :const:`re.I` (ignore case), :const:`re.L` (locale dependent),
+   :const:`re.M` (multi-line), :const:`re.S` (dot matches all), 
    and :const:`re.X` (verbose), for the entire regular expression. (The
    flags are described in :ref:`contents-of-module-re`.) This
    is useful if you wish to include the flags as part of the regular
@@ -324,56 +328,62 @@ the second character.  For example, ``\$`` matches the character ``'$'``.
    word is indicated by whitespace or a non-alphanumeric, non-underscore character.
    Note that  ``\b`` is defined as the boundary between ``\w`` and ``\W``, so the
    precise set of characters deemed to be alphanumeric depends on the values of the
-   ``UNICODE`` and ``LOCALE`` flags.  Inside a character range, ``\b`` represents
+   ``ASCII`` and ``LOCALE`` flags.  Inside a character range, ``\b`` represents
    the backspace character, for compatibility with Python's string literals.
 
 ``\B``
    Matches the empty string, but only when it is *not* at the beginning or end of a
    word.  This is just the opposite of ``\b``, so is also subject to the settings
-   of ``LOCALE`` and ``UNICODE``.
+   of ``ASCII`` and ``LOCALE``.
 
 ``\d``
-   When the :const:`UNICODE` flag is not specified, matches any decimal digit; this
-   is equivalent to the set ``[0-9]``.  With :const:`UNICODE`, it will match
-   whatever is classified as a digit in the Unicode character properties database.
+   For Unicode (str) patterns:
+      When the :const:`ASCII` flag is specified, matches any decimal digit; this
+      is equivalent to the set ``[0-9]``.  Otherwise, it will match whatever
+      is classified as a digit in the Unicode character properties database
+      (this includes the standard ASCII digits and is thus a superset
+      of ``[0-9]``).
+   For 8-bit (bytes) patterns:
+      Matches any decimal digit; this is equivalent to the set ``[0-9]``.
 
 ``\D``
-   When the :const:`UNICODE` flag is not specified, matches any non-digit
-   character; this is equivalent to the set  ``[^0-9]``.  With :const:`UNICODE`, it
-   will match  anything other than character marked as digits in the Unicode
-   character  properties database.
+   Matches any character which is not a decimal digit. This is the
+   opposite of ``\d`` and is therefore similarly subject to the setting of
+   the ``ASCII`` flag.
 
 ``\s``
-   When the :const:`LOCALE` and :const:`UNICODE` flags are not specified, matches
-   any whitespace character; this is equivalent to the set ``[ \t\n\r\f\v]``. With
-   :const:`LOCALE`, it will match this set plus whatever characters are defined as
-   space for the current locale. If :const:`UNICODE` is set, this will match the
-   characters ``[ \t\n\r\f\v]`` plus whatever is classified as space in the Unicode
-   character properties database.
+   For Unicode (str) patterns:
+      When the :const:`ASCII` flag is specified, matches only ASCII whitespace
+      characters; this is equivalent to the set ``[ \t\n\r\f\v]``. Otherwise,
+      it will match this set plus whatever is classified as space in the Unicode
+      character properties database (including for example the non-breaking
+      spaces mandated by typography rules in many languages).
+   For 8-bit (bytes) patterns:
+      Matches characters considered whitespace in the ASCII character set;
+      this is equivalent to the set ``[ \t\n\r\f\v]``.
 
 ``\S``
-   When the :const:`LOCALE` and :const:`UNICODE` flags are not specified, matches
-   any non-whitespace character; this is equivalent to the set ``[^ \t\n\r\f\v]``
-   With :const:`LOCALE`, it will match any character not in this set, and not
-   defined as space in the current locale. If :const:`UNICODE` is set, this will
-   match anything other than ``[ \t\n\r\f\v]`` and characters marked as space in
-   the Unicode character properties database.
+   Matches any character which is not a whitespace character. This is the
+   opposite of ``\s`` and is therefore similarly subject to the settings of
+   ``ASCII`` and ``LOCALE``.
 
 ``\w``
-   When the :const:`LOCALE` and :const:`UNICODE` flags are not specified, matches
-   any alphanumeric character and the underscore; this is equivalent to the set
-   ``[a-zA-Z0-9_]``.  With :const:`LOCALE`, it will match the set ``[0-9_]`` plus
-   whatever characters are defined as alphanumeric for the current locale.  If
-   :const:`UNICODE` is set, this will match the characters ``[0-9_]`` plus whatever
-   is classified as alphanumeric in the Unicode character properties database.
+   For Unicode (str) patterns:
+      When the :const:`ASCII` flag is specified, this is equivalent to the set
+      ``[a-zA-Z0-9_]``. Otherwise, it will match whatever is classified as
+      alphanumeric in the Unicode character properties database (it will
+      include most characters that can be part of a word in whatever language,
+      as well as numbers and the underscore sign).
+   For 8-bit (bytes) patterns:
+      Matches characters considered alphanumeric in the ASCII character set;
+      this is equivalent to the set ``[a-zA-Z0-9_]``. With :const:`LOCALE`, 
+      it will additionally match whatever characters are defined as
+      alphanumeric for the current locale.
 
 ``\W``
-   When the :const:`LOCALE` and :const:`UNICODE` flags are not specified, matches
-   any non-alphanumeric character; this is equivalent to the set ``[^a-zA-Z0-9_]``.
-   With :const:`LOCALE`, it will match any character not in the set ``[0-9_]``, and
-   not defined as alphanumeric for the current locale. If :const:`UNICODE` is set,
-   this will match anything other than ``[0-9_]`` and characters marked as
-   alphanumeric in the Unicode character properties database.
+   Matches any character which is not an alphanumeric character. This is the
+   opposite of ``\w`` and is therefore similarly subject to the settings of
+   ``ASCII`` and ``LOCALE``.
 
 ``\Z``
    Matches only at the end of the string.
@@ -454,6 +464,25 @@ form.
       expression at a time needn't worry about compiling regular expressions.)
 
 
+.. data:: A
+          ASCII
+
+   Make ``\w``, ``\W``, ``\b``, ``\B``, ``\d``, ``\D``, ``\s`` and ``\S``
+   perform ASCII-only matching instead of full Unicode matching. This is only
+   meaningful for Unicode patterns, and is ignored for byte patterns.
+
+   Note that the :const:`re.U` flag still exists (as well as its synonym
+   :const:`re.UNICODE` and its embedded counterpart ``(?u)``), but it has
+   become useless in Python 3.0.
+   In previous Python versions, it was used to specify that 
+   matching had to be Unicode dependent (the default was ASCII matching in
+   all circumstances). Starting from Python 3.0, the default is Unicode 
+   matching for Unicode strings (which can be changed by specifying the
+   ``'a'`` flag), and ASCII matching for 8-bit strings. Further, Unicode
+   dependent matching for 8-bit strings isn't allowed anymore and results
+   in a :exc:`ValueError`.
+
+
 .. data:: I
           IGNORECASE
 
@@ -465,7 +494,10 @@ form.
           LOCALE
 
    Make ``\w``, ``\W``, ``\b``, ``\B``, ``\s`` and ``\S`` dependent on the
-   current locale.
+   current locale. The use of this flag is discouraged as the locale mechanism
+   is very unreliable, and it only handles one "culture" at a time anyway;
+   you should use Unicode matching instead, which is the default in Python 3.0
+   for Unicode (str) patterns.
 
 
 .. data:: M
@@ -486,13 +518,6 @@ form.
    newline; without this flag, ``'.'`` will match anything *except* a newline.
 
 
-.. data:: U
-          UNICODE
-
-   Make ``\w``, ``\W``, ``\b``, ``\B``, ``\d``, ``\D``, ``\s`` and ``\S`` dependent
-   on the Unicode character properties database.
-
-
 .. data:: X
           VERBOSE
 
@@ -511,6 +536,8 @@ form.
       b = re.compile(r"\d+\.\d*")
 
 
+
+
 .. function:: search(pattern, string[, flags])
 
    Scan through *string* looking for a location where the regular expression
 
@@ -14,7 +14,7 @@
 import locale
 import calendar
 from re import compile as re_compile
-from re import IGNORECASE
+from re import IGNORECASE, ASCII
 from re import escape as re_escape
 from datetime import date as datetime_date
 try:
@@ -262,7 +262,7 @@ def pattern(self, format):
 
     def compile(self, format):
         """Return a compiled re object for the format string."""
-        return re_compile(self.pattern(format), IGNORECASE)
+        return re_compile(self.pattern(format), IGNORECASE | ASCII)
 
 _cache_lock = _thread_allocate_lock()
 # DO NOT modify _TimeRE_cache or _regex_cache without acquiring the cache lock
 
@@ -39,7 +39,7 @@ def _translate(s, altchars):
     return s.translate(translation)
 
 
-
+
 # Base64 encoding/decoding uses binascii
 
 def b64encode(s, altchars=None):
@@ -126,7 +126,7 @@ def urlsafe_b64decode(s):
     return b64decode(s, b'-_')
 
 
-
+
 # Base32 encoding/decoding must be done in Python
 _b32alphabet = {
     0: b'A',  9: b'J', 18: b'S', 27: b'3',
@@ -225,7 +225,7 @@ def b32decode(s, casefold=False, map01=None):
     # characters because this will tell us how many null bytes to remove from
     # the end of the decoded string.
     padchars = 0
-    mo = re.search('(?P<pad>[=]*)$', s)
+    mo = re.search(b'(?P<pad>[=]*)$', s)
     if mo:
         padchars = len(mo.group('pad'))
         if padchars > 0:
@@ -262,7 +262,7 @@ def b32decode(s, casefold=False, map01=None):
     return b''.join(parts)
 
 
-
+
 # RFC 3548, Base 16 Alphabet specifies uppercase, but hexlify() returns
 # lowercase.  The RFC also recommends against accepting input case
 # insensitively.
@@ -291,12 +291,12 @@ def b16decode(s, casefold=False):
         raise TypeError("expected bytes, not %s" % s.__class__.__name__)
     if casefold:
         s = s.upper()
-    if re.search('[^0-9A-F]', s):
+    if re.search(b'[^0-9A-F]', s):
         raise binascii.Error('Non-base16 digit found')
     return binascii.unhexlify(s)
 
 
-
+
 # Legacy interface.  This code could be cleaned up since I don't believe
 # binascii has any line length limitations.  It just doesn't seem worth it
 # though.  The files should be opened in binary mode.
@@ -353,7 +353,7 @@ def decodestring(s):
     return binascii.a2b_base64(s)
 
 
-
+
 # Usable as a script...
 def main():
     """Small main program"""
 
@@ -5415,7 +5415,7 @@ def _convert_other(other, raiseit=False):
 # 2. For finite numbers (not infinities and NaNs) the body of the
 # number between the optional sign and the optional exponent must have
 # at least one decimal digit, possibly after the decimal point.  The
-# lookahead expression '(?=\d|\.\d)' checks this.
+# lookahead expression '(?=[0-9]|\.[0-9])' checks this.
 #
 # As the flag UNICODE is not enabled here, we're explicitly avoiding any
 # other meaning for \d than the numbers [0-9].
 
@@ -409,7 +409,7 @@ def get_versions():
         out = os.popen(gcc_exe + ' -dumpversion','r')
         out_string = out.read()
         out.close()
-        result = re.search('(\d+\.\d+(\.\d+)*)',out_string)
+        result = re.search('(\d+\.\d+(\.\d+)*)', out_string, re.ASCII)
         if result:
             gcc_version = StrictVersion(result.group(1))
         else:
@@ -421,7 +421,7 @@ def get_versions():
         out = os.popen(ld_exe + ' -v','r')
         out_string = out.read()
         out.close()
-        result = re.search('(\d+\.\d+(\.\d+)*)',out_string)
+        result = re.search('(\d+\.\d+(\.\d+)*)', out_string, re.ASCII)
         if result:
             ld_version = StrictVersion(result.group(1))
         else:
@@ -433,7 +433,7 @@ def get_versions():
         out = os.popen(dllwrap_exe + ' --version','r')
         out_string = out.read()
         out.close()
-        result = re.search(' (\d+\.\d+(\.\d+)*)',out_string)
+        result = re.search(' (\d+\.\d+(\.\d+)*)', out_string, re.ASCII)
         if result:
             dllwrap_version = StrictVersion(result.group(1))
         else:
 
@@ -300,7 +300,7 @@ def get_versions():
         out = os.popen(gcc_exe + ' -dumpversion','r')
         out_string = out.read()
         out.close()
-        result = re.search('(\d+\.\d+\.\d+)',out_string)
+        result = re.search('(\d+\.\d+\.\d+)', out_string, re.ASCII)
         if result:
             gcc_version = StrictVersion(result.group(1))
         else:
 
@@ -512,7 +512,7 @@ def get_config_vars(*args):
                         # patched up as well.
                         'CFLAGS', 'PY_CFLAGS', 'BLDSHARED'):
                     flags = _config_vars[key]
-                    flags = re.sub('-arch\s+\w+\s', ' ', flags)
+                    flags = re.sub('-arch\s+\w+\s', ' ', flags, flags=re.ASCII)
                     flags = re.sub('-isysroot [^ \t]*', ' ', flags)
                     _config_vars[key] = flags
 
 
@@ -81,7 +81,7 @@ def get_platform ():
         return "%s-%s.%s" % (osname, version, release)
     elif osname[:6] == "cygwin":
         osname = "cygwin"
-        rel_re = re.compile (r'[\d.]+')
+        rel_re = re.compile (r'[\d.]+', re.ASCII)
         m = rel_re.match(release)
         if m:
             release = m.group()
 
@@ -134,7 +134,7 @@ class StrictVersion (Version):
     """
 
     version_re = re.compile(r'^(\d+) \. (\d+) (\. (\d+))? ([ab](\d+))?$',
-                            re.VERBOSE)
+                            re.VERBOSE | re.ASCII)
 
 
     def parse (self, vstring):
 
@@ -5,7 +5,8 @@
 import operator
 
 
-re_validPackage = re.compile(r"(?i)^\s*([a-z_]\w*(?:\.[a-z_]\w*)*)(.*)")
+re_validPackage = re.compile(r"(?i)^\s*([a-z_]\w*(?:\.[a-z_]\w*)*)(.*)",
+    re.ASCII)
 # (package) (rest)
 
 re_paren = re.compile(r"^\s*\((.*)\)\s*$") # (list) inside of parentheses
@@ -153,7 +154,8 @@ def split_provision(value):
     global _provision_rx
     if _provision_rx is None:
         _provision_rx = re.compile(
-            "([a-zA-Z_]\w*(?:\.[a-zA-Z_]\w*)*)(?:\s*\(\s*([^)\s]+)\s*\))?$")
+            "([a-zA-Z_]\w*(?:\.[a-zA-Z_]\w*)*)(?:\s*\(\s*([^)\s]+)\s*\))?$",
+            re.ASCII)
     value = value.strip()
     m = _provision_rx.match(value)
     if not m:
Original file line number	Diff line number	Diff line change
`@@ -5415,7 +5415,7 @@ def _convert_other(other, raiseit=False):`
`5415`	`5415`	`# 2. For finite numbers (not infinities and NaNs) the body of the`
`5416`	`5416`	`# number between the optional sign and the optional exponent must have`
`5417`	`5417`	`# at least one decimal digit, possibly after the decimal point. The`
`5418`		`-# lookahead expression '(?=\d\|\.\d)' checks this.`
	`5418`	`+# lookahead expression '(?=[0-9]\|\.[0-9])' checks this.`
`5419`	`5419`	`#`
`5420`	`5420`	`# As the flag UNICODE is not enabled here, we're explicitly avoiding any`
`5421`	`5421`	`# other meaning for \d than the numbers [0-9].`