Skip to content

Commit f3ebc9f

Browse files
Issue python#20491: The textwrap.TextWrapper class now honors non-breaking spaces.
Based on patch by Kaarle Ritvanen.
1 parent 42babab commit f3ebc9f

File tree

3 files changed

+47
-14
lines changed

3 files changed

+47
-14
lines changed

Lib/test/test_textwrap.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -444,6 +444,37 @@ def test_umlaut_followed_by_dash(self):
444444
text = "aa \xe4\xe4-\xe4\xe4"
445445
self.check_wrap(text, 7, ["aa \xe4\xe4-", "\xe4\xe4"])
446446

447+
def test_non_breaking_space(self):
448+
text = 'This is a sentence with non-breaking\N{NO-BREAK SPACE}space.'
449+
450+
self.check_wrap(text, 20,
451+
['This is a sentence',
452+
'with non-',
453+
'breaking\N{NO-BREAK SPACE}space.'],
454+
break_on_hyphens=True)
455+
456+
self.check_wrap(text, 20,
457+
['This is a sentence',
458+
'with',
459+
'non-breaking\N{NO-BREAK SPACE}space.'],
460+
break_on_hyphens=False)
461+
462+
def test_narrow_non_breaking_space(self):
463+
text = ('This is a sentence with non-breaking'
464+
'\N{NARROW NO-BREAK SPACE}space.')
465+
466+
self.check_wrap(text, 20,
467+
['This is a sentence',
468+
'with non-',
469+
'breaking\N{NARROW NO-BREAK SPACE}space.'],
470+
break_on_hyphens=True)
471+
472+
self.check_wrap(text, 20,
473+
['This is a sentence',
474+
'with',
475+
'non-breaking\N{NARROW NO-BREAK SPACE}space.'],
476+
break_on_hyphens=False)
477+
447478

448479
class MaxLinesTestCase(BaseTestCase):
449480
text = "Hello there, how are you this fine day? I'm glad to hear it!"

Lib/textwrap.py

Lines changed: 13 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,8 @@
1010
__all__ = ['TextWrapper', 'wrap', 'fill', 'dedent', 'indent', 'shorten']
1111

1212
# Hardcode the recognized whitespace characters to the US-ASCII
13-
# whitespace characters. The main reason for doing this is that in
14-
# ISO-8859-1, 0xa0 is non-breaking whitespace, so in certain locales
15-
# that character winds up in string.whitespace. Respecting
16-
# string.whitespace in those cases would 1) make textwrap treat 0xa0 the
17-
# same as any other whitespace char, which is clearly wrong (it's a
18-
# *non-breaking* space), 2) possibly cause problems with Unicode,
19-
# since 0xa0 is not in range(128).
13+
# whitespace characters. The main reason for doing this is that
14+
# some Unicode spaces (like \u00a0) are non-breaking whitespace.
2015
_whitespace = '\t\n\x0b\x0c\r '
2116

2217
class TextWrapper:
@@ -81,29 +76,34 @@ class TextWrapper:
8176
# (after stripping out empty strings).
8277
word_punct = r'[\w!"\'&.,?]'
8378
letter = r'[^\d\W]'
79+
whitespace = r'[%s]' % re.escape(_whitespace)
80+
nowhitespace = '[^' + whitespace[1:]
8481
wordsep_re = re.compile(r'''
8582
( # any whitespace
86-
\s+
83+
%(ws)s+
8784
| # em-dash between words
8885
(?<=%(wp)s) -{2,} (?=\w)
8986
| # word, possibly hyphenated
90-
\S+? (?:
87+
%(nws)s+? (?:
9188
# hyphenated word
9289
-(?: (?<=%(lt)s{2}-) | (?<=%(lt)s-%(lt)s-))
9390
(?= %(lt)s -? %(lt)s)
9491
| # end of word
95-
(?=\s|\Z)
92+
(?=%(ws)s|\Z)
9693
| # em-dash
9794
(?<=%(wp)s) (?=-{2,}\w)
9895
)
99-
)''' % {'wp': word_punct, 'lt': letter}, re.VERBOSE)
100-
del word_punct, letter
96+
)''' % {'wp': word_punct, 'lt': letter,
97+
'ws': whitespace, 'nws': nowhitespace},
98+
re.VERBOSE)
99+
del word_punct, letter, nowhitespace
101100

102101
# This less funky little regex just splits on recognized spaces. E.g.
103102
# "Hello there -- you goof-ball, use the -b option!"
104103
# splits into
105104
# Hello/ /there/ /--/ /you/ /goof-ball,/ /use/ /the/ /-b/ /option!/
106-
wordsep_simple_re = re.compile(r'(\s+)')
105+
wordsep_simple_re = re.compile(r'(%s+)' % whitespace)
106+
del whitespace
107107

108108
# XXX this is not locale- or charset-aware -- string.lowercase
109109
# is US-ASCII only (and therefore English-only)
@@ -112,7 +112,6 @@ class TextWrapper:
112112
r'[\"\']?' # optional end-of-quote
113113
r'\Z') # end of chunk
114114

115-
116115
def __init__(self,
117116
width=70,
118117
initial_indent="",

Misc/NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,9 @@ Core and Builtins
113113
Library
114114
-------
115115

116+
- Issue #20491: The textwrap.TextWrapper class now honors non-breaking spaces.
117+
Based on patch by Kaarle Ritvanen.
118+
116119
- Issue #28353: os.fwalk() no longer fails on broken links.
117120

118121
- Issue #25464: Fixed HList.header_exists() in tkinter.tix module by adding

0 commit comments

Comments
 (0)