@@ -37,6 +37,22 @@ mod _json {
3737 count
3838 }
3939
/// Report whether the characters yielded by `chars` begin with `pattern`.
///
/// The comparison is done char-by-char rather than on byte slices, so it is
/// safe to use on an iterator positioned by character index inside a string
/// containing non-ASCII text. An empty `pattern` always matches. The iterator
/// is consumed up to and including the first mismatching character.
#[inline]
fn starts_with_chars<I>(mut chars: I, pattern: &str) -> bool
where
    I: Iterator<Item = char>,
{
    // `all` stops at the first mismatch (or when `chars` runs dry, since
    // `None` never equals `Some(wanted)`).
    pattern.chars().all(|wanted| chars.next() == Some(wanted))
}
55+
4056 #[ pyattr( name = "make_scanner" ) ]
4157 #[ pyclass( name = "Scanner" , traverse) ]
4258 #[ derive( Debug , PyPayload ) ]
@@ -202,6 +218,50 @@ mod _json {
202218 Some ( ( ret, buf. len ( ) ) )
203219 }
204220
221+ /// Parse a number from a character iterator.
222+ /// Returns (result, character_count) where character_count is the number of chars consumed.
223+ fn parse_number_from_chars < I > ( & self , chars : I , vm : & VirtualMachine ) -> Option < ( PyResult , usize ) >
224+ where
225+ I : Iterator < Item = char > ,
226+ {
227+ let mut buf = String :: new ( ) ;
228+ let mut has_neg = false ;
229+ let mut has_decimal = false ;
230+ let mut has_exponent = false ;
231+ let mut has_e_sign = false ;
232+
233+ for c in chars {
234+ let i = buf. len ( ) ;
235+ match c {
236+ '-' if i == 0 => has_neg = true ,
237+ n if n. is_ascii_digit ( ) => { }
238+ '.' if !has_decimal => has_decimal = true ,
239+ 'e' | 'E' if !has_exponent => has_exponent = true ,
240+ '+' | '-' if !has_e_sign => has_e_sign = true ,
241+ _ => break ,
242+ }
243+ buf. push ( c) ;
244+ }
245+
246+ let len = buf. len ( ) ;
247+ if len == 0 || ( len == 1 && has_neg) {
248+ return None ;
249+ }
250+
251+ let ret = if has_decimal || has_exponent {
252+ if let Some ( ref parse_float) = self . parse_float {
253+ parse_float. call ( ( & buf, ) , vm)
254+ } else {
255+ Ok ( vm. ctx . new_float ( f64:: from_str ( & buf) . unwrap ( ) ) . into ( ) )
256+ }
257+ } else if let Some ( ref parse_int) = self . parse_int {
258+ parse_int. call ( ( & buf, ) , vm)
259+ } else {
260+ Ok ( vm. new_pyobj ( BigInt :: from_str ( & buf) . unwrap ( ) ) )
261+ } ;
262+ Some ( ( ret, len) )
263+ }
264+
205265 /// Parse a JSON object starting after the opening '{'.
206266 /// Returns (parsed_object, end_character_index).
207267 fn parse_object (
@@ -458,6 +518,7 @@ mod _json {
458518 }
459519
460520 /// Call scan_once and handle the result.
521+ /// Uses character iterators to avoid byte/char index mismatch with non-ASCII strings.
461522 fn call_scan_once (
462523 & self ,
463524 scan_once : & PyObjectRef ,
@@ -466,100 +527,92 @@ mod _json {
466527 memo : & mut HashMap < String , PyStrRef > ,
467528 vm : & VirtualMachine ,
468529 ) -> PyResult < ( PyObjectRef , usize ) > {
469- // First try to handle common cases directly in Rust
470530 let s = pystr. as_str ( ) ;
471- let mut chars = s. chars ( ) . skip ( idx) . peekable ( ) ;
531+ let chars = s. chars ( ) . skip ( idx) . peekable ( ) ;
472532
473- let remaining = & s[ idx..] ;
533+ let first_char = match chars. clone ( ) . next ( ) {
534+ Some ( c) => c,
535+ None => return Err ( self . make_decode_error ( "Expecting value" , pystr, idx, vm) ) ,
536+ } ;
474537
475- match chars . peek ( ) {
476- Some ( '"' ) => {
477- // String - parse directly in Rust
538+ match first_char {
539+ '"' => {
540+ // String
478541 let ( wtf8, end) = machinery:: scanstring ( pystr. as_wtf8 ( ) , idx + 1 , self . strict )
479542 . map_err ( |e| py_decode_error ( e, pystr. clone ( ) , vm) ) ?;
480543 let py_str = vm. ctx . new_str ( wtf8. to_string ( ) ) ;
481- return Ok ( ( py_str. into ( ) , end) ) ;
544+ Ok ( ( py_str. into ( ) , end) )
482545 }
483- Some ( '{' ) => {
484- // Nested object - parse recursively in Rust
485- return self . parse_object ( pystr, idx + 1 , scan_once, memo, vm) ;
486- }
487- Some ( '[' ) => {
488- // Nested array - parse recursively in Rust
489- return self . parse_array ( pystr, idx + 1 , scan_once, memo, vm) ;
546+ '{' => {
547+ // Object
548+ self . parse_object ( pystr, idx + 1 , scan_once, memo, vm)
490549 }
491- Some ( 'n' ) => {
492- // null - parse directly in Rust
493- if remaining. starts_with ( "null" ) {
494- return Ok ( ( vm. ctx . none ( ) , idx + 4 ) ) ;
495- }
550+ '[' => {
551+ // Array
552+ self . parse_array ( pystr, idx + 1 , scan_once, memo, vm)
496553 }
497- Some ( 't' ) => {
498- // true - parse directly in Rust
499- if remaining. starts_with ( "true" ) {
500- return Ok ( ( vm. ctx . new_bool ( true ) . into ( ) , idx + 4 ) ) ;
501- }
554+ 'n' if starts_with_chars ( chars. clone ( ) , "null" ) => {
555+ // null
556+ Ok ( ( vm. ctx . none ( ) , idx + 4 ) )
502557 }
503- Some ( 'f' ) => {
504- // false - parse directly in Rust
505- if remaining. starts_with ( "false" ) {
506- return Ok ( ( vm. ctx . new_bool ( false ) . into ( ) , idx + 5 ) ) ;
507- }
558+ 't' if starts_with_chars ( chars. clone ( ) , "true" ) => {
559+ // true
560+ Ok ( ( vm. ctx . new_bool ( true ) . into ( ) , idx + 4 ) )
508561 }
509- Some ( c) if c. is_ascii_digit ( ) => {
510- // Number starting with digit - parse directly in Rust
511- if let Some ( ( result, len) ) = self . parse_number ( remaining, vm) {
512- return Ok ( ( result?, idx + len) ) ;
513- }
562+ 'f' if starts_with_chars ( chars. clone ( ) , "false" ) => {
563+ // false
564+ Ok ( ( vm. ctx . new_bool ( false ) . into ( ) , idx + 5 ) )
514565 }
515- Some ( 'N' ) => {
516- // NaN - parse directly in Rust
517- if remaining. starts_with ( "NaN" ) {
518- let result = self . parse_constant . call ( ( "NaN" , ) , vm) ?;
519- return Ok ( ( result, idx + 3 ) ) ;
520- }
566+ 'N' if starts_with_chars ( chars. clone ( ) , "NaN" ) => {
567+ // NaN
568+ let result = self . parse_constant . call ( ( "NaN" , ) , vm) ?;
569+ Ok ( ( result, idx + 3 ) )
521570 }
522- Some ( 'I' ) => {
523- // Infinity - parse directly in Rust
524- if remaining. starts_with ( "Infinity" ) {
525- let result = self . parse_constant . call ( ( "Infinity" , ) , vm) ?;
526- return Ok ( ( result, idx + 8 ) ) ;
527- }
571+ 'I' if starts_with_chars ( chars. clone ( ) , "Infinity" ) => {
572+ // Infinity
573+ let result = self . parse_constant . call ( ( "Infinity" , ) , vm) ?;
574+ Ok ( ( result, idx + 8 ) )
528575 }
529- Some ( '-' ) => {
576+ '-' => {
530577 // -Infinity or negative number
531- if remaining . starts_with ( "-Infinity" ) {
578+ if starts_with_chars ( chars . clone ( ) , "-Infinity" ) {
532579 let result = self . parse_constant . call ( ( "-Infinity" , ) , vm) ?;
533580 return Ok ( ( result, idx + 9 ) ) ;
534581 }
535- // Try parsing as negative number
536- if let Some ( ( result, len) ) = self . parse_number ( remaining , vm) {
582+ // Negative number - collect number characters
583+ if let Some ( ( result, len) ) = self . parse_number_from_chars ( chars , vm) {
537584 return Ok ( ( result?, idx + len) ) ;
538585 }
586+ Err ( self . make_decode_error ( "Expecting value" , pystr, idx, vm) )
539587 }
540- _ => {
541- // fall through to call scan_once
542- }
543- }
544-
545- // Fall back to scan_once for other value types
546- let result = scan_once. call ( ( pystr. clone ( ) , idx as isize ) , vm) ;
547-
548- match result {
549- Ok ( tuple) => {
550- use crate :: vm:: builtins:: PyTupleRef ;
551- let tuple: PyTupleRef = tuple. try_into_value ( vm) ?;
552- if tuple. len ( ) != 2 {
553- return Err ( vm. new_value_error ( "scan_once must return 2-tuple" ) ) ;
588+ c if c. is_ascii_digit ( ) => {
589+ // Positive number
590+ if let Some ( ( result, len) ) = self . parse_number_from_chars ( chars, vm) {
591+ return Ok ( ( result?, idx + len) ) ;
554592 }
555- let value = tuple. as_slice ( ) [ 0 ] . clone ( ) ;
556- let end_idx: isize = tuple. as_slice ( ) [ 1 ] . try_to_value ( vm) ?;
557- Ok ( ( value, end_idx as usize ) )
558- }
559- Err ( err) if err. fast_isinstance ( vm. ctx . exceptions . stop_iteration ) => {
560593 Err ( self . make_decode_error ( "Expecting value" , pystr, idx, vm) )
561594 }
562- Err ( err) => Err ( err) ,
595+ _ => {
596+ // Fall back to scan_once for unrecognized input
597+ let result = scan_once. call ( ( pystr. clone ( ) , idx as isize ) , vm) ;
598+
599+ match result {
600+ Ok ( tuple) => {
601+ use crate :: vm:: builtins:: PyTupleRef ;
602+ let tuple: PyTupleRef = tuple. try_into_value ( vm) ?;
603+ if tuple. len ( ) != 2 {
604+ return Err ( vm. new_value_error ( "scan_once must return 2-tuple" ) ) ;
605+ }
606+ let value = tuple. as_slice ( ) [ 0 ] . clone ( ) ;
607+ let end_idx: isize = tuple. as_slice ( ) [ 1 ] . try_to_value ( vm) ?;
608+ Ok ( ( value, end_idx as usize ) )
609+ }
610+ Err ( err) if err. fast_isinstance ( vm. ctx . exceptions . stop_iteration ) => {
611+ Err ( self . make_decode_error ( "Expecting value" , pystr, idx, vm) )
612+ }
613+ Err ( err) => Err ( err) ,
614+ }
615+ }
563616 }
564617 }
565618
0 commit comments