1111sace_prefix = "xn--"
1212
1313# This assumes query strings, so AllowUnassigned is true
14- def nameprep (label ):
14+ def nameprep (label ): # type: (str) -> str
1515 # Map
1616 newlabel = []
1717 for c in label :
@@ -25,7 +25,7 @@ def nameprep(label):
2525 label = unicodedata .normalize ("NFKC" , label )
2626
2727 # Prohibit
28- for c in label :
28+ for i , c in enumerate ( label ) :
2929 if stringprep .in_table_c12 (c ) or \
3030 stringprep .in_table_c22 (c ) or \
3131 stringprep .in_table_c3 (c ) or \
@@ -35,7 +35,7 @@ def nameprep(label):
3535 stringprep .in_table_c7 (c ) or \
3636 stringprep .in_table_c8 (c ) or \
3737 stringprep .in_table_c9 (c ):
38- raise UnicodeError ( "Invalid character %r" % c )
38+ raise UnicodeEncodeError ( "idna" , label , i , i + 1 , f"Invalid character { c !r } " )
3939
4040 # Check bidi
4141 RandAL = [stringprep .in_table_d1 (x ) for x in label ]
@@ -46,59 +46,73 @@ def nameprep(label):
4646 # This is table C.8, which was already checked
4747 # 2) If a string contains any RandALCat character, the string
4848 # MUST NOT contain any LCat character.
49- if any (stringprep .in_table_d2 (x ) for x in label ):
50- raise UnicodeError ("Violation of BIDI requirement 2" )
49+ for i , x in enumerate (label ):
50+ if stringprep .in_table_d2 (x ):
51+ raise UnicodeEncodeError ("idna" , label , i , i + 1 ,
52+ "Violation of BIDI requirement 2" )
5153 # 3) If a string contains any RandALCat character, a
5254 # RandALCat character MUST be the first character of the
5355 # string, and a RandALCat character MUST be the last
5456 # character of the string.
55- if not RandAL [0 ] or not RandAL [- 1 ]:
56- raise UnicodeError ("Violation of BIDI requirement 3" )
57+ if not RandAL [0 ]:
58+ raise UnicodeEncodeError ("idna" , label , 0 , 1 ,
59+ "Violation of BIDI requirement 3" )
60+ if not RandAL [- 1 ]:
61+ raise UnicodeEncodeError ("idna" , label , len (label )- 1 , len (label ),
62+ "Violation of BIDI requirement 3" )
5763
5864 return label
5965
60- def ToASCII (label ):
66+ def ToASCII (label ): # type: (str) -> bytes
6167 try :
6268 # Step 1: try ASCII
63- label = label .encode ("ascii" )
64- except UnicodeError :
69+ label_ascii = label .encode ("ascii" )
70+ except UnicodeEncodeError :
6571 pass
6672 else :
6773 # Skip to step 3: UseSTD3ASCIIRules is false, so
6874 # Skip to step 8.
69- if 0 < len (label ) < 64 :
70- return label
71- raise UnicodeError ("label empty or too long" )
75+ if 0 < len (label_ascii ) < 64 :
76+ return label_ascii
77+ if len (label ) == 0 :
78+ raise UnicodeEncodeError ("idna" , label , 0 , 1 , "label empty" )
79+ else :
80+ raise UnicodeEncodeError ("idna" , label , 0 , len (label ), "label too long" )
7281
7382 # Step 2: nameprep
7483 label = nameprep (label )
7584
7685 # Step 3: UseSTD3ASCIIRules is false
7786 # Step 4: try ASCII
7887 try :
79- label = label .encode ("ascii" )
80- except UnicodeError :
88+ label_ascii = label .encode ("ascii" )
89+ except UnicodeEncodeError :
8190 pass
8291 else :
8392 # Skip to step 8.
8493 if 0 < len (label ) < 64 :
85- return label
86- raise UnicodeError ("label empty or too long" )
94+ return label_ascii
95+ if len (label ) == 0 :
96+ raise UnicodeEncodeError ("idna" , label , 0 , 1 , "label empty" )
97+ else :
98+ raise UnicodeEncodeError ("idna" , label , 0 , len (label ), "label too long" )
8799
88100 # Step 5: Check ACE prefix
89- if label [:4 ].lower () == sace_prefix :
90- raise UnicodeError ("Label starts with ACE prefix" )
101+ if label .lower ().startswith (sace_prefix ):
102+ raise UnicodeEncodeError (
103+ "idna" , label , 0 , len (sace_prefix ), "Label starts with ACE prefix" )
91104
92105 # Step 6: Encode with PUNYCODE
93- label = label .encode ("punycode" )
106+ label_ascii = label .encode ("punycode" )
94107
95108 # Step 7: Prepend ACE prefix
96- label = ace_prefix + label
109+ label_ascii = ace_prefix + label_ascii
97110
98111 # Step 8: Check size
99- if 0 < len (label ) < 64 :
100- return label
101- raise UnicodeError ("label empty or too long" )
112+ # do not check for empty as we prepend ace_prefix.
113+ if len (label_ascii ) < 64 :
114+ return label_ascii
115+ raise UnicodeEncodeError ("idna" , label , 0 , len (label ), "label too long" )
102116
103117def ToUnicode (label ):
104118 if len (label ) > 1024 :
@@ -110,41 +124,51 @@ def ToUnicode(label):
110124 # per https://www.rfc-editor.org/rfc/rfc3454#section-3.1 while still
111125 # preventing us from wasting time decoding a big thing that'll just
112126 # hit the actual <= 63 length limit in Step 6.
113- raise UnicodeError ("label way too long" )
127+ if isinstance (label , str ):
128+ label = label .encode ("utf-8" , errors = "backslashreplace" )
129+ raise UnicodeDecodeError ("idna" , label , 0 , len (label ), "label way too long" )
114130 # Step 1: Check for ASCII
115131 if isinstance (label , bytes ):
116132 pure_ascii = True
117133 else :
118134 try :
119135 label = label .encode ("ascii" )
120136 pure_ascii = True
121- except UnicodeError :
137+ except UnicodeEncodeError :
122138 pure_ascii = False
123139 if not pure_ascii :
140+ assert isinstance (label , str )
124141 # Step 2: Perform nameprep
125142 label = nameprep (label )
126143 # It doesn't say this, but apparently, it should be ASCII now
127144 try :
128145 label = label .encode ("ascii" )
129- except UnicodeError :
130- raise UnicodeError ("Invalid character in IDN label" )
146+ except UnicodeEncodeError as exc :
147+ raise UnicodeEncodeError ("idna" , label , exc .start , exc .end ,
148+ "Invalid character in IDN label" )
131149 # Step 3: Check for ACE prefix
132- if not label [:4 ].lower () == ace_prefix :
150+ assert isinstance (label , bytes )
151+ if not label .lower ().startswith (ace_prefix ):
133152 return str (label , "ascii" )
134153
135154 # Step 4: Remove ACE prefix
136155 label1 = label [len (ace_prefix ):]
137156
138157 # Step 5: Decode using PUNYCODE
139- result = label1 .decode ("punycode" )
158+ try :
159+ result = label1 .decode ("punycode" )
160+ except UnicodeDecodeError as exc :
161+ offset = len (ace_prefix )
162+ raise UnicodeDecodeError ("idna" , label , offset + exc .start , offset + exc .end , exc .reason )
140163
141164 # Step 6: Apply ToASCII
142165 label2 = ToASCII (result )
143166
144167 # Step 7: Compare the result of step 6 with the one of step 3
145168 # label2 will already be in lower case.
146169 if str (label , "ascii" ).lower () != str (label2 , "ascii" ):
147- raise UnicodeError ("IDNA does not round-trip" , label , label2 )
170+ raise UnicodeDecodeError ("idna" , label , 0 , len (label ),
171+ f"IDNA does not round-trip, '{ label !r} ' != '{ label2 !r} '" )
148172
149173 # Step 8: return the result of step 5
150174 return result
@@ -156,7 +180,7 @@ def encode(self, input, errors='strict'):
156180
157181 if errors != 'strict' :
158182 # IDNA is quite clear that implementations must be strict
159- raise UnicodeError ("unsupported error handling " + errors )
183+ raise UnicodeError (f"Unsupported error handling: { errors } " )
160184
161185 if not input :
162186 return b'' , 0
@@ -168,11 +192,16 @@ def encode(self, input, errors='strict'):
168192 else :
169193 # ASCII name: fast path
170194 labels = result .split (b'.' )
171- for label in labels [:- 1 ]:
172- if not (0 < len (label ) < 64 ):
173- raise UnicodeError ("label empty or too long" )
174- if len (labels [- 1 ]) >= 64 :
175- raise UnicodeError ("label too long" )
195+ for i , label in enumerate (labels [:- 1 ]):
196+ if len (label ) == 0 :
197+ offset = sum (len (l ) for l in labels [:i ]) + i
198+ raise UnicodeEncodeError ("idna" , input , offset , offset + 1 ,
199+ "label empty" )
200+ for i , label in enumerate (labels ):
201+ if len (label ) >= 64 :
202+ offset = sum (len (l ) for l in labels [:i ]) + i
203+ raise UnicodeEncodeError ("idna" , input , offset , offset + len (label ),
204+ "label too long" )
176205 return result , len (input )
177206
178207 result = bytearray ()
@@ -182,17 +211,27 @@ def encode(self, input, errors='strict'):
182211 del labels [- 1 ]
183212 else :
184213 trailing_dot = b''
185- for label in labels :
214+ for i , label in enumerate ( labels ) :
186215 if result :
187216 # Join with U+002E
188217 result .extend (b'.' )
189- result .extend (ToASCII (label ))
218+ try :
219+ result .extend (ToASCII (label ))
220+ except (UnicodeEncodeError , UnicodeDecodeError ) as exc :
221+ offset = sum (len (l ) for l in labels [:i ]) + i
222+ raise UnicodeEncodeError (
223+ "idna" ,
224+ input ,
225+ offset + exc .start ,
226+ offset + exc .end ,
227+ exc .reason ,
228+ )
190229 return bytes (result + trailing_dot ), len (input )
191230
192231 def decode (self , input , errors = 'strict' ):
193232
194233 if errors != 'strict' :
195- raise UnicodeError ("Unsupported error handling " + errors )
234+ raise UnicodeError (f "Unsupported error handling: { errors } " )
196235
197236 if not input :
198237 return "" , 0
@@ -218,16 +257,23 @@ def decode(self, input, errors='strict'):
218257 trailing_dot = ''
219258
220259 result = []
221- for label in labels :
222- result .append (ToUnicode (label ))
260+ for i , label in enumerate (labels ):
261+ try :
262+ u_label = ToUnicode (label )
263+ except (UnicodeEncodeError , UnicodeDecodeError ) as exc :
264+ offset = sum (len (x ) for x in labels [:i ]) + len (labels [:i ])
265+ raise UnicodeDecodeError (
266+ "idna" , input , offset + exc .start , offset + exc .end , exc .reason )
267+ else :
268+ result .append (u_label )
223269
224270 return "." .join (result )+ trailing_dot , len (input )
225271
226272class IncrementalEncoder (codecs .BufferedIncrementalEncoder ):
227273 def _buffer_encode (self , input , errors , final ):
228274 if errors != 'strict' :
229275 # IDNA is quite clear that implementations must be strict
230- raise UnicodeError ("unsupported error handling " + errors )
276+ raise UnicodeError (f"Unsupported error handling: { errors } " )
231277
232278 if not input :
233279 return (b'' , 0 )
@@ -251,7 +297,16 @@ def _buffer_encode(self, input, errors, final):
251297 # Join with U+002E
252298 result .extend (b'.' )
253299 size += 1
254- result .extend (ToASCII (label ))
300+ try :
301+ result .extend (ToASCII (label ))
302+ except (UnicodeEncodeError , UnicodeDecodeError ) as exc :
303+ raise UnicodeEncodeError (
304+ "idna" ,
305+ input ,
306+ size + exc .start ,
307+ size + exc .end ,
308+ exc .reason ,
309+ )
255310 size += len (label )
256311
257312 result += trailing_dot
@@ -261,7 +316,7 @@ def _buffer_encode(self, input, errors, final):
261316class IncrementalDecoder (codecs .BufferedIncrementalDecoder ):
262317 def _buffer_decode (self , input , errors , final ):
263318 if errors != 'strict' :
264- raise UnicodeError ("Unsupported error handling " + errors )
319+ raise UnicodeError ("Unsupported error handling: { errors}" )
265320
266321 if not input :
267322 return ("" , 0 )
@@ -271,7 +326,11 @@ def _buffer_decode(self, input, errors, final):
271326 labels = dots .split (input )
272327 else :
273328 # Must be ASCII string
274- input = str (input , "ascii" )
329+ try :
330+ input = str (input , "ascii" )
331+ except (UnicodeEncodeError , UnicodeDecodeError ) as exc :
332+ raise UnicodeDecodeError ("idna" , input ,
333+ exc .start , exc .end , exc .reason )
275334 labels = input .split ("." )
276335
277336 trailing_dot = ''
@@ -288,7 +347,18 @@ def _buffer_decode(self, input, errors, final):
288347 result = []
289348 size = 0
290349 for label in labels :
291- result .append (ToUnicode (label ))
350+ try :
351+ u_label = ToUnicode (label )
352+ except (UnicodeEncodeError , UnicodeDecodeError ) as exc :
353+ raise UnicodeDecodeError (
354+ "idna" ,
355+ input .encode ("ascii" , errors = "backslashreplace" ),
356+ size + exc .start ,
357+ size + exc .end ,
358+ exc .reason ,
359+ )
360+ else :
361+ result .append (u_label )
292362 if size :
293363 size += 1
294364 size += len (label )
0 commit comments