3131 DtypeArg ,
3232 FilePath ,
3333 IndexLabel ,
34+ JSONEngine ,
3435 JSONSerializable ,
3536 ReadBuffer ,
3637 StorageOptions ,
6970 build_table_schema ,
7071 parse_table_schema ,
7172)
73+ from pandas .io .json .arrow_json_parser_wrapper import ArrowJsonParserWrapper
7274from pandas .io .parsers .readers import validate_integer
7375
7476if TYPE_CHECKING :
@@ -389,6 +391,7 @@ def read_json(
389391 date_unit : str | None = ...,
390392 encoding : str | None = ...,
391393 encoding_errors : str | None = ...,
394+ engine : JSONEngine = ...,
392395 lines : bool = ...,
393396 chunksize : int ,
394397 compression : CompressionOptions = ...,
@@ -417,6 +420,7 @@ def read_json(
417420 compression : CompressionOptions = ...,
418421 nrows : int | None = ...,
419422 storage_options : StorageOptions = ...,
423+ engine : JSONEngine = ...,
420424) -> JsonReader [Literal ["series" ]]:
421425 ...
422426
@@ -440,6 +444,7 @@ def read_json(
440444 compression : CompressionOptions = ...,
441445 nrows : int | None = ...,
442446 storage_options : StorageOptions = ...,
447+ engine : JSONEngine = ...,
443448) -> Series :
444449 ...
445450
@@ -463,6 +468,7 @@ def read_json(
463468 compression : CompressionOptions = ...,
464469 nrows : int | None = ...,
465470 storage_options : StorageOptions = ...,
471+ engine : JSONEngine = ...,
466472) -> DataFrame :
467473 ...
468474
@@ -489,6 +495,7 @@ def read_json(
489495 compression : CompressionOptions = "infer" ,
490496 nrows : int | None = None ,
491497 storage_options : StorageOptions = None ,
498+ engine : JSONEngine = "ujson" ,
492499) -> DataFrame | Series | JsonReader :
493500 """
494501 Convert a JSON string to pandas object.
@@ -597,6 +604,9 @@ def read_json(
597604
598605 .. versionadded:: 1.3.0
599606
607+ engine : {{'ujson', 'pyarrow'}}, default "ujson"
608+ Parser engine to use.
609+
600610 lines : bool, default False
601611 Read the file as a json object per line.
602612
@@ -738,6 +748,7 @@ def read_json(
738748 nrows = nrows ,
739749 storage_options = storage_options ,
740750 encoding_errors = encoding_errors ,
751+ engine = engine ,
741752 )
742753
743754 if chunksize :
@@ -773,6 +784,7 @@ def __init__(
773784 nrows : int | None ,
774785 storage_options : StorageOptions = None ,
775786 encoding_errors : str | None = "strict" ,
787+ engine : JSONEngine = "ujson" ,
776788 ) -> None :
777789
778790 self .orient = orient
@@ -784,6 +796,7 @@ def __init__(
784796 self .precise_float = precise_float
785797 self .date_unit = date_unit
786798 self .encoding = encoding
799+ self .engine = engine
787800 self .compression = compression
788801 self .storage_options = storage_options
789802 self .lines = lines
@@ -801,9 +814,48 @@ def __init__(
801814 self .nrows = validate_integer ("nrows" , self .nrows , 0 )
802815 if not self .lines :
803816 raise ValueError ("nrows can only be passed if lines=True" )
817+ if self .engine == "pyarrow" :
818+ if not self .lines :
819+ raise ValueError (
820+ "currently pyarrow engine only supports "
821+ "the line-delimited JSON format"
822+ )
823+ if self .engine not in ["pyarrow" , "ujson" ]:
824+ raise ValueError (
825+ f"The engine type { self .engine } is currently not supported."
826+ )
827+
828+ if self .engine == "pyarrow" :
829+ self ._engine = self ._make_engine (filepath_or_buffer )
830+ if self .engine == "ujson" :
831+ data = self ._get_data_from_filepath (filepath_or_buffer )
832+ self .data = self ._preprocess_data (data )
833+
834+ def _make_engine (
835+ self ,
836+ filepath_or_buffer : FilePath | ReadBuffer [str ] | ReadBuffer [bytes ],
837+ ) -> ArrowJsonParserWrapper :
838+
839+ if not isinstance (filepath_or_buffer , list ):
840+ is_text = False
841+ mode = "rb"
842+ self .handles = get_handle (
843+ self ._get_data_from_filepath (filepath_or_buffer ),
844+ mode = mode ,
845+ encoding = self .encoding ,
846+ is_text = is_text ,
847+ compression = self .compression ,
848+ storage_options = self .storage_options ,
849+ errors = self .encoding_errors ,
850+ )
851+ filepath_or_buffer = self .handles .handle
804852
805- data = self ._get_data_from_filepath (filepath_or_buffer )
806- self .data = self ._preprocess_data (data )
853+ try :
854+ return ArrowJsonParserWrapper (filepath_or_buffer )
855+ except Exception :
856+ if self .handles is not None :
857+ self .handles .close ()
858+ raise
807859
808860 def _preprocess_data (self , data ):
809861 """
@@ -888,19 +940,22 @@ def read(self) -> DataFrame | Series:
888940 """
889941 obj : DataFrame | Series
890942 with self :
891- if self .lines :
892- if self .chunksize :
893- obj = concat (self )
894- elif self .nrows :
895- lines = list (islice (self .data , self .nrows ))
896- lines_json = self ._combine_lines (lines )
897- obj = self ._get_object_parser (lines_json )
943+ if self .engine == "pyarrow" :
944+ obj = self ._engine .read ()
945+ if self .engine == "ujson" :
946+ if self .lines :
947+ if self .chunksize :
948+ obj = concat (self )
949+ elif self .nrows :
950+ lines = list (islice (self .data , self .nrows ))
951+ lines_json = self ._combine_lines (lines )
952+ obj = self ._get_object_parser (lines_json )
953+ else :
954+ data = ensure_str (self .data )
955+ data_lines = data .split ("\n " )
956+ obj = self ._get_object_parser (self ._combine_lines (data_lines ))
898957 else :
899- data = ensure_str (self .data )
900- data_lines = data .split ("\n " )
901- obj = self ._get_object_parser (self ._combine_lines (data_lines ))
902- else :
903- obj = self ._get_object_parser (self .data )
958+ obj = self ._get_object_parser (self .data )
904959 return obj
905960
906961 def _get_object_parser (self , json ) -> DataFrame | Series :
0 commit comments