2525import java .nio .charset .StandardCharsets ;
2626import java .util .ArrayList ;
2727import java .util .Arrays ;
28- import java .util .Collections ;
2928import java .util .HashMap ;
3029import java .util .List ;
3130import java .util .Map ;
9190import org .apache .arrow .vector .types .pojo .DictionaryEncoding ;
9291import org .apache .arrow .vector .types .pojo .Field ;
9392import org .apache .arrow .vector .types .pojo .FieldType ;
93+ import org .apache .arrow .vector .util .JsonStringArrayList ;
9494import org .apache .avro .LogicalType ;
9595import org .apache .avro .LogicalTypes ;
9696import org .apache .avro .Schema ;
@@ -188,14 +188,15 @@ private static Consumer createConsumer(
188188 consumer = new AvroStringConsumer ((VarCharVector ) vector );
189189 break ;
190190 case FIXED :
191+ Map <String , String > extProps = createExternalProps (schema );
191192 if (logicalType instanceof LogicalTypes .Decimal ) {
192193 arrowType = createDecimalArrowType ((LogicalTypes .Decimal ) logicalType );
193- fieldType = new FieldType (nullable , arrowType , /*dictionary=*/ null , getMetaData (schema ));
194+ fieldType = new FieldType (nullable , arrowType , /*dictionary=*/ null , getMetaData (schema , extProps ));
194195 vector = createVector (consumerVector , fieldType , name , allocator );
195196 consumer = new AvroDecimalConsumer .FixedDecimalConsumer ((DecimalVector ) vector , schema .getFixedSize ());
196197 } else {
197198 arrowType = new ArrowType .FixedSizeBinary (schema .getFixedSize ());
198- fieldType = new FieldType (nullable , arrowType , /*dictionary=*/ null , getMetaData (schema ));
199+ fieldType = new FieldType (nullable , arrowType , /*dictionary=*/ null , getMetaData (schema , extProps ));
199200 vector = createVector (consumerVector , fieldType , name , allocator );
200201 consumer = new AvroFixedConsumer ((FixedSizeBinaryVector ) vector , schema .getFixedSize ());
201202 }
@@ -417,31 +418,34 @@ private static String getDefaultFieldName(ArrowType type) {
417418 }
418419
419420 private static Field avroSchemaToField (Schema schema , String name , AvroToArrowConfig config ) {
421+ return avroSchemaToField (schema , name , config , null );
422+ }
423+
424+ private static Field avroSchemaToField (
425+ Schema schema ,
426+ String name ,
427+ AvroToArrowConfig config ,
428+ Map <String , String > externalProps ) {
429+
420430 final Type type = schema .getType ();
421431 final LogicalType logicalType = schema .getLogicalType ();
422- final ArrowType arrowType ;
432+ final List <Field > children = new ArrayList <>();
433+ final FieldType fieldType ;
423434
424435 switch (type ) {
425436 case UNION :
426- List <Field > children = new ArrayList <>();
427437 for (int i = 0 ; i < schema .getTypes ().size (); i ++) {
428438 Schema childSchema = schema .getTypes ().get (i );
429439 // Union child vector should use default name
430440 children .add (avroSchemaToField (childSchema , null , config ));
431441 }
432- arrowType = new ArrowType .Union (UnionMode .Sparse , null );
433- if (name == null ) {
434- name = getDefaultFieldName (arrowType );
435- }
436- return new Field (name , FieldType .nullable (arrowType ), children );
442+ fieldType = createFieldType (new ArrowType .Union (UnionMode .Sparse , null ), schema , externalProps );
443+ break ;
437444 case ARRAY :
438445 Schema elementSchema = schema .getElementType ();
439- arrowType = new ArrowType .List ();
440- if (name == null ) {
441- name = getDefaultFieldName (arrowType );
442- }
443- return new Field (name , FieldType .nullable (arrowType ),
444- Collections .singletonList (avroSchemaToField (elementSchema , elementSchema .getName (), config )));
446+ children .add (avroSchemaToField (elementSchema , elementSchema .getName (), config ));
447+ fieldType = createFieldType (new ArrowType .List (), schema , externalProps );
448+ break ;
445449 case MAP :
446450 // MapVector internal struct field and key field should be non-nullable
447451 FieldType keyFieldType = new FieldType (/*nullable=*/ false , new ArrowType .Utf8 (), /*dictionary=*/ null );
@@ -450,95 +454,106 @@ private static Field avroSchemaToField(Schema schema, String name, AvroToArrowCo
450454
451455 FieldType structFieldType = new FieldType (false , new ArrowType .Struct (), /*dictionary=*/ null );
452456 Field structField = new Field ("internal" , structFieldType , Arrays .asList (keyField , valueField ));
453- arrowType = new ArrowType .Map (/*keySorted=*/ false );
454- if (name == null ) {
455- name = getDefaultFieldName (arrowType );
456- }
457- return new Field (name , FieldType .nullable (arrowType ), Collections .singletonList (structField ));
457+ children .add (structField );
458+ fieldType = createFieldType (new ArrowType .Map (/*keySorted=*/ false ), schema , externalProps );
459+ break ;
458460 case RECORD :
459- List <Field > childFields = new ArrayList <>();
460461 final Set <String > skipFieldNames = config .getSkipFieldNames ();
461462 for (int i = 0 ; i < schema .getFields ().size (); i ++) {
462463 final Schema .Field field = schema .getFields ().get (i );
463464 Schema childSchema = field .schema ();
464465 String fullChildName = String .format ("%s.%s" , name , field .name ());
465466 if (!skipFieldNames .contains (fullChildName )) {
466- childFields .add (avroSchemaToField (childSchema , fullChildName , config ));
467+ final Map <String , String > extProps = new HashMap <>();
468+ String doc = field .doc ();
469+ Set <String > aliases = field .aliases ();
470+ if (doc != null ) {
471+ extProps .put ("doc" , doc );
472+ }
473+ if (aliases != null ) {
474+ extProps .put ("aliases" , convertAliases (aliases ));
475+ }
476+ children .add (avroSchemaToField (childSchema , fullChildName , config , extProps ));
467477 }
468478 }
469- arrowType = new ArrowType .Struct ();
470- if (name == null ) {
471- name = getDefaultFieldName (arrowType );
472- }
473- return new Field (name , FieldType .nullable (arrowType ), childFields );
479+ fieldType = createFieldType (new ArrowType .Struct (), schema , externalProps );
480+ break ;
474481 case ENUM :
475482 DictionaryProvider .MapDictionaryProvider provider = config .getProvider ();
476483 int current = provider .getDictionaryIds ().size ();
477484 int enumCount = schema .getEnumSymbols ().size ();
478485 ArrowType .Int indexType = DictionaryEncoder .getIndexType (enumCount );
479- FieldType indexFieldType = new FieldType (true , indexType ,
486+
487+ fieldType = createFieldType (indexType , schema , externalProps ,
480488 new DictionaryEncoding (current , /*ordered=*/ false , /*indexType=*/ indexType ));
481- return new Field ( name , indexFieldType , null ) ;
489+ break ;
482490
483491 case STRING :
484- arrowType = new ArrowType .Utf8 ();
492+ fieldType = createFieldType ( new ArrowType .Utf8 (), schema , externalProps );
485493 break ;
486494 case FIXED :
495+ final ArrowType fixedArrowType ;
487496 if (logicalType instanceof LogicalTypes .Decimal ) {
488- arrowType = createDecimalArrowType ((LogicalTypes .Decimal ) logicalType );
497+ fixedArrowType = createDecimalArrowType ((LogicalTypes .Decimal ) logicalType );
489498 } else {
490- arrowType = new ArrowType .FixedSizeBinary (schema .getFixedSize ());
499+ fixedArrowType = new ArrowType .FixedSizeBinary (schema .getFixedSize ());
491500 }
501+ fieldType = createFieldType (fixedArrowType , schema , externalProps );
492502 break ;
493503 case INT :
504+ final ArrowType intArrowType ;
494505 if (logicalType instanceof LogicalTypes .Date ) {
495- arrowType = new ArrowType .Date (DateUnit .DAY );
506+ intArrowType = new ArrowType .Date (DateUnit .DAY );
496507 } else if (logicalType instanceof LogicalTypes .TimeMillis ) {
497- arrowType = new ArrowType .Time (TimeUnit .MILLISECOND , 32 );
508+ intArrowType = new ArrowType .Time (TimeUnit .MILLISECOND , 32 );
498509 } else {
499- arrowType = new ArrowType .Int (32 , /*signed=*/ true );
510+ intArrowType = new ArrowType .Int (32 , /*signed=*/ true );
500511 }
512+ fieldType = createFieldType (intArrowType , schema , externalProps );
501513 break ;
502514 case BOOLEAN :
503- arrowType = new ArrowType .Bool ();
515+ fieldType = createFieldType ( new ArrowType .Bool (), schema , externalProps );
504516 break ;
505517 case LONG :
518+ final ArrowType longArrowType ;
506519 if (logicalType instanceof LogicalTypes .TimeMicros ) {
507- arrowType = new ArrowType .Time (TimeUnit .MICROSECOND , 64 );
520+ longArrowType = new ArrowType .Time (TimeUnit .MICROSECOND , 64 );
508521 } else if (logicalType instanceof LogicalTypes .TimestampMillis ) {
509- arrowType = new ArrowType .Timestamp (TimeUnit .MILLISECOND , null );
522+ longArrowType = new ArrowType .Timestamp (TimeUnit .MILLISECOND , null );
510523 } else if (logicalType instanceof LogicalTypes .TimestampMicros ) {
511- arrowType = new ArrowType .Timestamp (TimeUnit .MICROSECOND , null );
524+ longArrowType = new ArrowType .Timestamp (TimeUnit .MICROSECOND , null );
512525 } else {
513- arrowType = new ArrowType .Int (64 , /*signed=*/ true );
526+ longArrowType = new ArrowType .Int (64 , /*signed=*/ true );
514527 }
528+ fieldType = createFieldType (longArrowType , schema , externalProps );
515529 break ;
516530 case FLOAT :
517- arrowType = new ArrowType .FloatingPoint (SINGLE );
531+ fieldType = createFieldType ( new ArrowType .FloatingPoint (SINGLE ), schema , externalProps );
518532 break ;
519533 case DOUBLE :
520- arrowType = new ArrowType .FloatingPoint (DOUBLE );
534+ fieldType = createFieldType ( new ArrowType .FloatingPoint (DOUBLE ), schema , externalProps );
521535 break ;
522536 case BYTES :
537+ final ArrowType bytesArrowType ;
523538 if (logicalType instanceof LogicalTypes .Decimal ) {
524- arrowType = createDecimalArrowType ((LogicalTypes .Decimal ) logicalType );
539+ bytesArrowType = createDecimalArrowType ((LogicalTypes .Decimal ) logicalType );
525540 } else {
526- arrowType = new ArrowType .Binary ();
541+ bytesArrowType = new ArrowType .Binary ();
527542 }
528-
543+ fieldType = createFieldType ( bytesArrowType , schema , externalProps );
529544 break ;
530545 case NULL :
531- arrowType = new ArrowType .Null ( );
546+ fieldType = createFieldType ( ArrowType .Null . INSTANCE , schema , externalProps );
532547 break ;
533548 default :
534549 // no-op, shouldn't get here
535550 throw new UnsupportedOperationException ();
536551 }
537552
538553 if (name == null ) {
539- name = getDefaultFieldName (arrowType );
554+ name = getDefaultFieldName (fieldType . getType () );
540555 }
541- return Field . nullable (name , arrowType );
556+ return new Field (name , fieldType , children . size () == 0 ? null : children );
542557 }
543558
544559 private static Consumer createArrayConsumer (Schema schema , String name , AvroToArrowConfig config ,
@@ -568,7 +583,7 @@ private static Consumer createStructConsumer(Schema schema, String name, AvroToA
568583
569584 StructVector structVector ;
570585 if (consumerVector == null ) {
571- final Field field = avroSchemaToField (schema , name , config );
586+ final Field field = avroSchemaToField (schema , name , config , createExternalProps ( schema ) );
572587 structVector = (StructVector ) field .createVector (config .getAllocator ());
573588 } else {
574589 structVector = (StructVector ) consumerVector ;
@@ -600,7 +615,7 @@ private static Consumer createEnumConsumer(Schema schema, String name, AvroToArr
600615
601616 BaseIntVector indexVector ;
602617 if (consumerVector == null ) {
603- final Field field = avroSchemaToField (schema , name , config );
618+ final Field field = avroSchemaToField (schema , name , config , createExternalProps ( schema ) );
604619 indexVector = (BaseIntVector ) field .createVector (config .getAllocator ());
605620 } else {
606621 indexVector = (BaseIntVector ) consumerVector ;
@@ -676,12 +691,6 @@ private static Consumer createUnionConsumer(Schema schema, String name, AvroToAr
676691 return new AvroUnionsConsumer (unionVector , delegates , types );
677692 }
678693
679- private static Map <String , String > getMetaData (Schema schema ) {
680- Map <String , String > metadata = new HashMap <>();
681- schema .getObjectProps ().forEach ((k ,v ) -> metadata .put (k , v .toString ()));
682- return metadata ;
683- }
684-
685694 /**
686695 * Read data from {@link Decoder} and generate a {@link VectorSchemaRoot}.
687696 * @param schema avro schema
@@ -740,4 +749,54 @@ static VectorSchemaRoot avroToArrowVectors(
740749
741750 return root ;
742751 }
752+
753+ private static Map <String , String > getMetaData (Schema schema ) {
754+ Map <String , String > metadata = new HashMap <>();
755+ schema .getObjectProps ().forEach ((k ,v ) -> metadata .put (k , v .toString ()));
756+ return metadata ;
757+ }
758+
759+ private static Map <String , String > getMetaData (Schema schema , Map <String , String > externalProps ) {
760+ Map <String , String > metadata = getMetaData (schema );
761+ if (externalProps != null ) {
762+ metadata .putAll (externalProps );
763+ }
764+ return metadata ;
765+ }
766+
767+ /**
768+ * Parse avro attributes and convert them to metadata.
769+ */
770+ private static Map <String , String > createExternalProps (Schema schema ) {
771+ final Map <String , String > extProps = new HashMap <>();
772+ String doc = schema .getDoc ();
773+ Set <String > aliases = schema .getAliases ();
774+ if (doc != null ) {
775+ extProps .put ("doc" , doc );
776+ }
777+ if (aliases != null ) {
778+ extProps .put ("aliases" , convertAliases (aliases ));
779+ }
780+ return extProps ;
781+ }
782+
783+ private static FieldType createFieldType (ArrowType arrowType , Schema schema , Map <String , String > externalProps ) {
784+ return createFieldType (arrowType , schema , externalProps , /*dictionary=*/ null );
785+ }
786+
787+ private static FieldType createFieldType (
788+ ArrowType arrowType ,
789+ Schema schema ,
790+ Map <String , String > externalProps ,
791+ DictionaryEncoding dictionary ) {
792+
793+ return new FieldType (/*nullable=*/ false , arrowType , dictionary ,
794+ getMetaData (schema , externalProps ));
795+ }
796+
797+ private static String convertAliases (Set <String > aliases ) {
798+ JsonStringArrayList jsonList = new JsonStringArrayList ();
799+ aliases .stream ().forEach (a -> jsonList .add (a ));
800+ return jsonList .toString ();
801+ }
743802}
0 commit comments