python · serhiy-storchaka · Oct 24, 2017 · Oct 4, 2017 · Oct 4, 2017 · Oct 5, 2017
diff --git a/Doc/library/re.rst b/Doc/library/re.rst
@@ -245,16 +245,32 @@ The special characters are:
    *cannot* be retrieved after performing a match or referenced later in the
    pattern.
 
-``(?imsx-imsx:...)``
-   (Zero or more letters from the set ``'i'``, ``'m'``, ``'s'``, ``'x'``,
-   optionally followed by ``'-'`` followed by one or more letters from the
-   same set.)  The letters set or removes the corresponding flags:
-   :const:`re.I` (ignore case), :const:`re.M` (multi-line), :const:`re.S`
-   (dot matches all), and :const:`re.X` (verbose), for the part of the
-   expression.  (The flags are described in :ref:`contents-of-module-re`.)
+``(?aiLmsux-imsx:...)``
+   (Zero or more letters from the set ``'a'``, ``'i'``, ``'L'``, ``'m'``,
+   ``'s'``, ``'u'``, ``'x'``, optionally followed by ``'-'`` followed by
+   one or more letters from the set ``'i'``, ``'m'``, ``'s'``, ``'x'``.)
+   The letters set or remove the corresponding flags:
+   :const:`re.A` (ASCII-only matching), :const:`re.I` (ignore case),
+   :const:`re.L` (locale dependent), :const:`re.M` (multi-line),
+   :const:`re.S` (dot matches all), :const:`re.U` (Unicode matching),
+   and :const:`re.X` (verbose), for the part of the expression.
+   (The flags are described in :ref:`contents-of-module-re`.)
+
+   The letters ``'a'``, ``'L'`` and ``'u'`` are mutually exclusive when used
+   as inline flags, so they can't be combined or follow ``'-'``.  Instead,
+   when one of them appears in an inline group, it overrides the matching mode
+   in the enclosing group.  In Unicode patterns ``(?a:...)`` switches to
+   ASCII-only matching, and ``(?u:...)`` switches to Unicode matching
+   (default).  In bytes patterns ``(?L:...)`` switches to locale-dependent
+   matching, and ``(?a:...)`` switches to ASCII-only matching (default).
+   This override is only in effect for the narrow inline group, and the
+   original matching mode is restored outside of the group.
 
    .. versionadded:: 3.6
 
+   .. versionchanged:: 3.7
+      The letters ``'a'``, ``'L'`` and ``'u'`` can also be used in a group.
+
 ``(?P<name>...)``
    Similar to regular parentheses, but the substring matched by the group is
    accessible via the symbolic group name *name*.  Group names must be valid
@@ -384,29 +400,23 @@ character ``'$'``.
       Matches any Unicode decimal digit (that is, any character in
       Unicode character category [Nd]).  This includes ``[0-9]``, and
       also many other digit characters.  If the :const:`ASCII` flag is
-      used only ``[0-9]`` is matched (but the flag affects the entire
-      regular expression, so in such cases using an explicit ``[0-9]``
-      may be a better choice).
+      used only ``[0-9]`` is matched.
 
    For 8-bit (bytes) patterns:
       Matches any decimal digit; this is equivalent to ``[0-9]``.
 
 ``\D``
    Matches any character which is not a decimal digit. This is
    the opposite of ``\d``. If the :const:`ASCII` flag is used this
-   becomes the equivalent of ``[^0-9]`` (but the flag affects the entire
-   regular expression, so in such cases using an explicit ``[^0-9]`` may
-   be a better choice).
+   becomes the equivalent of ``[^0-9]``.
 
 ``\s``
    For Unicode (str) patterns:
       Matches Unicode whitespace characters (which includes
       ``[ \t\n\r\f\v]``, and also many other characters, for example the
       non-breaking spaces mandated by typography rules in many
       languages). If the :const:`ASCII` flag is used, only
-      ``[ \t\n\r\f\v]`` is matched (but the flag affects the entire
-      regular expression, so in such cases using an explicit
-      ``[ \t\n\r\f\v]`` may be a better choice).
+      ``[ \t\n\r\f\v]`` is matched.
 
    For 8-bit (bytes) patterns:
       Matches characters considered whitespace in the ASCII character set;
@@ -415,18 +425,14 @@ character ``'$'``.
 ``\S``
    Matches any character which is not a whitespace character. This is
    the opposite of ``\s``. If the :const:`ASCII` flag is used this
-   becomes the equivalent of ``[^ \t\n\r\f\v]`` (but the flag affects the entire
-   regular expression, so in such cases using an explicit ``[^ \t\n\r\f\v]`` may
-   be a better choice).
+   becomes the equivalent of ``[^ \t\n\r\f\v]``.
 
 ``\w``
    For Unicode (str) patterns:
       Matches Unicode word characters; this includes most characters
       that can be part of a word in any language, as well as numbers and
       the underscore. If the :const:`ASCII` flag is used, only
-      ``[a-zA-Z0-9_]`` is matched (but the flag affects the entire
-      regular expression, so in such cases using an explicit
-      ``[a-zA-Z0-9_]`` may be a better choice).
+      ``[a-zA-Z0-9_]`` is matched.
 
    For 8-bit (bytes) patterns:
       Matches characters considered alphanumeric in the ASCII character set;
@@ -437,9 +443,7 @@ character ``'$'``.
 ``\W``
    Matches any character which is not a word character. This is
    the opposite of ``\w``. If the :const:`ASCII` flag is used this
-   becomes the equivalent of ``[^a-zA-Z0-9_]`` (but the flag affects the
-   entire regular expression, so in such cases using an explicit
-   ``[^a-zA-Z0-9_]`` may be a better choice).  If the :const:`LOCALE` flag is
+   becomes the equivalent of ``[^a-zA-Z0-9_]``.  If the :const:`LOCALE` flag is
    used, matches characters considered alphanumeric in the current locale
    and the underscore.
 
@@ -563,9 +567,7 @@ form.
    letter I with dot above), 'ı' (U+0131, Latin small letter dotless i),
    'ſ' (U+017F, Latin small letter long s) and 'K' (U+212A, Kelvin sign).
    If the :const:`ASCII` flag is used, only letters 'a' to 'z'
-   and 'A' to 'Z' are matched (but the flag affects the entire regular
-   expression, so in such cases using an explicit ``(?-i:[a-zA-Z])`` may be
-   a better choice).
+   and 'A' to 'Z' are matched.
 
 .. data:: L
           LOCALE

diff --git a/Doc/whatsnew/3.7.rst b/Doc/whatsnew/3.7.rst
@@ -290,6 +290,13 @@ pdb
 argument.  If given, this is printed to the console just before debugging
 begins.
 
+re
+--
+
+The flags :const:`re.ASCII`, :const:`re.LOCALE` and :const:`re.UNICODE`
+can be set within the scope of a group.
+(Contributed by Serhiy Storchaka in :issue:`31690`.)
+
 string
 ------
 

diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
@@ -62,6 +62,12 @@
 _ignorecase_fixes = {i: tuple(j for j in t if i != j)
                      for t in _equivalences for i in t}
 
+def _combine_flags(flags, add_flags, del_flags,
+                   TYPE_FLAGS=sre_parse.TYPE_FLAGS):
+    if add_flags & TYPE_FLAGS:
+        flags &= ~TYPE_FLAGS
+    return (flags | add_flags) & ~del_flags
+
 def _compile(code, pattern, flags):
     # internal: compile a (sub)pattern
     emit = code.append
@@ -87,15 +93,21 @@ def _compile(code, pattern, flags):
                 emit(op)
                 emit(av)
             elif flags & SRE_FLAG_LOCALE:
-                emit(OP_LOC_IGNORE[op])
+                emit(OP_LOCALE_IGNORE[op])
                 emit(av)
             elif not iscased(av):
                 emit(op)
                 emit(av)
             else:
                 lo = tolower(av)
-                if fixes and lo in fixes:
-                    emit(IN_IGNORE)
+                if not fixes:  # ascii
+                    emit(OP_IGNORE[op])
+                    emit(lo)
+                elif lo not in fixes:
+                    emit(OP_UNICODE_IGNORE[op])
+                    emit(lo)
+                else:
+                    emit(IN_UNI_IGNORE)
                     skip = _len(code); emit(0)
                     if op is NOT_LITERAL:
                         emit(NEGATE)
@@ -104,17 +116,16 @@ def _compile(code, pattern, flags):
                         emit(k)
                     emit(FAILURE)
                     code[skip] = _len(code) - skip
-                else:
-                    emit(OP_IGNORE[op])
-                    emit(lo)
         elif op is IN:
             charset, hascased = _optimize_charset(av, iscased, tolower, fixes)
             if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
                 emit(IN_LOC_IGNORE)
-            elif hascased:
+            elif not hascased:
+                emit(IN)
+            elif not fixes:  # ascii
                 emit(IN_IGNORE)
             else:
-                emit(IN)
+                emit(IN_UNI_IGNORE)
             skip = _len(code); emit(0)
             _compile_charset(charset, flags, code)
             code[skip] = _len(code) - skip
@@ -153,8 +164,8 @@ def _compile(code, pattern, flags):
             if group:
                 emit(MARK)
                 emit((group-1)*2)
-            # _compile_info(code, p, (flags | add_flags) & ~del_flags)
-            _compile(code, p, (flags | add_flags) & ~del_flags)
+            # _compile_info(code, p, _combine_flags(flags, add_flags, del_flags))
+            _compile(code, p, _combine_flags(flags, add_flags, del_flags))
             if group:
                 emit(MARK)
                 emit((group-1)*2+1)
@@ -210,10 +221,14 @@ def _compile(code, pattern, flags):
                 av = CH_UNICODE[av]
             emit(av)
         elif op is GROUPREF:
-            if flags & SRE_FLAG_IGNORECASE:
-                emit(OP_IGNORE[op])
-            else:
+            if not flags & SRE_FLAG_IGNORECASE:
                 emit(op)
+            elif flags & SRE_FLAG_LOCALE:
+                emit(GROUPREF_LOC_IGNORE)
+            elif not fixes:  # ascii
+                emit(GROUPREF_IGNORE)
+            else:
+                emit(GROUPREF_UNI_IGNORE)
             emit(av-1)
         elif op is GROUPREF_EXISTS:
             emit(op)
@@ -240,7 +255,7 @@ def _compile_charset(charset, flags, code):
             pass
         elif op is LITERAL:
             emit(av)
-        elif op is RANGE or op is RANGE_IGNORE:
+        elif op is RANGE or op is RANGE_UNI_IGNORE:
             emit(av[0])
             emit(av[1])
         elif op is CHARSET:
@@ -309,9 +324,9 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
                     hascased = True
                     # There are only two ranges of cased non-BMP characters:
                     # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),
-                    # and for both ranges RANGE_IGNORE works.
+                    # and for both ranges RANGE_UNI_IGNORE works.
                     if op is RANGE:
-                        op = RANGE_IGNORE
+                        op = RANGE_UNI_IGNORE
                 tail.append((op, av))
             break
 
@@ -456,7 +471,7 @@ def _get_literal_prefix(pattern, flags):
             prefixappend(av)
         elif op is SUBPATTERN:
             group, add_flags, del_flags, p = av
-            flags1 = (flags | add_flags) & ~del_flags
+            flags1 = _combine_flags(flags, add_flags, del_flags)
             if flags1 & SRE_FLAG_IGNORECASE and flags1 & SRE_FLAG_LOCALE:
                 break
             prefix1, prefix_skip1, got_all = _get_literal_prefix(p, flags1)
@@ -482,7 +497,7 @@ def _get_charset_prefix(pattern, flags):
         if op is not SUBPATTERN:
             break
         group, add_flags, del_flags, pattern = av
-        flags = (flags | add_flags) & ~del_flags
+        flags = _combine_flags(flags, add_flags, del_flags)
         if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
             return None
 
@@ -631,6 +646,7 @@ def print_2(*args):
                 print_(op)
             elif op in (LITERAL, NOT_LITERAL,
                         LITERAL_IGNORE, NOT_LITERAL_IGNORE,
+                        LITERAL_UNI_IGNORE, NOT_LITERAL_UNI_IGNORE,
                         LITERAL_LOC_IGNORE, NOT_LITERAL_LOC_IGNORE):
                 arg = code[i]
                 i += 1
@@ -647,12 +663,12 @@ def print_2(*args):
                 arg = str(CHCODES[arg])
                 assert arg[:9] == 'CATEGORY_'
                 print_(op, arg[9:])
-            elif op in (IN, IN_IGNORE, IN_LOC_IGNORE):
+            elif op in (IN, IN_IGNORE, IN_UNI_IGNORE, IN_LOC_IGNORE):
                 skip = code[i]
                 print_(op, skip, to=i+skip)
                 dis_(i+1, i+skip)
                 i += skip
-            elif op in (RANGE, RANGE_IGNORE):
+            elif op in (RANGE, RANGE_UNI_IGNORE):
                 lo, hi = code[i: i+2]
                 i += 2
                 print_(op, '%#02x %#02x (%r-%r)' % (lo, hi, chr(lo), chr(hi)))
@@ -671,7 +687,8 @@ def print_2(*args):
                     print_2(_hex_code(code[i: i + 256//_CODEBITS]))
                     i += 256//_CODEBITS
                 level -= 1
-            elif op in (MARK, GROUPREF, GROUPREF_IGNORE):
+            elif op in (MARK, GROUPREF, GROUPREF_IGNORE, GROUPREF_UNI_IGNORE,
+                        GROUPREF_LOC_IGNORE):
                 arg = code[i]
                 i += 1
                 print_(op, arg)

diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py
@@ -13,7 +13,7 @@
 
 # update when constants are added or removed
 
-MAGIC = 20170530
+MAGIC = 20171005
 
 from _sre import MAXREPEAT, MAXGROUPS
 
@@ -84,25 +84,37 @@ def _makecodes(names):
     CALL
     CATEGORY
     CHARSET BIGCHARSET
-    GROUPREF GROUPREF_EXISTS GROUPREF_IGNORE
-    IN IN_IGNORE
+    GROUPREF GROUPREF_EXISTS
+    IN
     INFO
     JUMP
-    LITERAL LITERAL_IGNORE
+    LITERAL
     MARK
     MAX_UNTIL
     MIN_UNTIL
-    NOT_LITERAL NOT_LITERAL_IGNORE
+    NOT_LITERAL
     NEGATE
     RANGE
     REPEAT
     REPEAT_ONE
     SUBPATTERN
     MIN_REPEAT_ONE
-    RANGE_IGNORE
+
+    GROUPREF_IGNORE
+    IN_IGNORE
+    LITERAL_IGNORE
+    NOT_LITERAL_IGNORE
+
+    GROUPREF_LOC_IGNORE
+    IN_LOC_IGNORE
     LITERAL_LOC_IGNORE
     NOT_LITERAL_LOC_IGNORE
-    IN_LOC_IGNORE
+
+    GROUPREF_UNI_IGNORE
+    IN_UNI_IGNORE
+    LITERAL_UNI_IGNORE
+    NOT_LITERAL_UNI_IGNORE
+    RANGE_UNI_IGNORE
 
     MIN_REPEAT MAX_REPEAT
 """)
@@ -113,7 +125,9 @@ def _makecodes(names):
     AT_BEGINNING AT_BEGINNING_LINE AT_BEGINNING_STRING
     AT_BOUNDARY AT_NON_BOUNDARY
     AT_END AT_END_LINE AT_END_STRING
+
     AT_LOC_BOUNDARY AT_LOC_NON_BOUNDARY
+
     AT_UNI_BOUNDARY AT_UNI_NON_BOUNDARY
 """)
 
@@ -123,7 +137,9 @@ def _makecodes(names):
     CATEGORY_SPACE CATEGORY_NOT_SPACE
     CATEGORY_WORD CATEGORY_NOT_WORD
     CATEGORY_LINEBREAK CATEGORY_NOT_LINEBREAK
+
     CATEGORY_LOC_WORD CATEGORY_LOC_NOT_WORD
+
     CATEGORY_UNI_DIGIT CATEGORY_UNI_NOT_DIGIT
     CATEGORY_UNI_SPACE CATEGORY_UNI_NOT_SPACE
     CATEGORY_UNI_WORD CATEGORY_UNI_NOT_WORD
@@ -133,18 +149,20 @@ def _makecodes(names):
 
 # replacement operations for "ignore case" mode
 OP_IGNORE = {
-    GROUPREF: GROUPREF_IGNORE,
-    IN: IN_IGNORE,
     LITERAL: LITERAL_IGNORE,
     NOT_LITERAL: NOT_LITERAL_IGNORE,
-    RANGE: RANGE_IGNORE,
 }
 
-OP_LOC_IGNORE = {
+OP_LOCALE_IGNORE = {
     LITERAL: LITERAL_LOC_IGNORE,
     NOT_LITERAL: NOT_LITERAL_LOC_IGNORE,
 }
 
+OP_UNICODE_IGNORE = {
+    LITERAL: LITERAL_UNI_IGNORE,
+    NOT_LITERAL: NOT_LITERAL_UNI_IGNORE,
+}
+
 AT_MULTILINE = {
     AT_BEGINNING: AT_BEGINNING_LINE,
     AT_END: AT_END_LINE