Skip to content

Commit bdfdc79

Browse files
tianchenemkornfield
andcommitted
ARROW-7490: [Java] Avro converter should convert attributes and props to FieldType metadata
Related to [ARROW-7490](https://issues.apache.org/jira/browse/ARROW-7490). Currently, the Avro converter uses some attributes (such as “name” and “size”) when creating vectors, while the others are discarded. Named types such as Record, Enum and Fixed may have attributes like “doc” and “aliases”, which should be kept in the metadata for potential further use. In addition, properties are not converted properly in some cases. Closes apache#6119 from tianchen92/ARROW-7490 and squashes the following commits: d1ebc28 <emkornfield> Merge branch 'master' into ARROW-7490 f1d0d9a <tianchen> ARROW-7490: Avro converter should convert attributes and props to FieldType metadata Lead-authored-by: tianchen <niki.lj@alibaba-inc.com> Co-authored-by: emkornfield <emkornfield@gmail.com> Signed-off-by: Micah Kornfield <emkornfield@gmail.com>
1 parent b178e15 commit bdfdc79

5 files changed

Lines changed: 273 additions & 57 deletions

File tree

java/adapter/avro/src/main/java/org/apache/arrow/AvroToArrowUtils.java

Lines changed: 116 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@
2525
import java.nio.charset.StandardCharsets;
2626
import java.util.ArrayList;
2727
import java.util.Arrays;
28-
import java.util.Collections;
2928
import java.util.HashMap;
3029
import java.util.List;
3130
import java.util.Map;
@@ -91,6 +90,7 @@
9190
import org.apache.arrow.vector.types.pojo.DictionaryEncoding;
9291
import org.apache.arrow.vector.types.pojo.Field;
9392
import org.apache.arrow.vector.types.pojo.FieldType;
93+
import org.apache.arrow.vector.util.JsonStringArrayList;
9494
import org.apache.avro.LogicalType;
9595
import org.apache.avro.LogicalTypes;
9696
import org.apache.avro.Schema;
@@ -188,14 +188,15 @@ private static Consumer createConsumer(
188188
consumer = new AvroStringConsumer((VarCharVector) vector);
189189
break;
190190
case FIXED:
191+
Map<String, String> extProps = createExternalProps(schema);
191192
if (logicalType instanceof LogicalTypes.Decimal) {
192193
arrowType = createDecimalArrowType((LogicalTypes.Decimal) logicalType);
193-
fieldType = new FieldType(nullable, arrowType, /*dictionary=*/null, getMetaData(schema));
194+
fieldType = new FieldType(nullable, arrowType, /*dictionary=*/null, getMetaData(schema, extProps));
194195
vector = createVector(consumerVector, fieldType, name, allocator);
195196
consumer = new AvroDecimalConsumer.FixedDecimalConsumer((DecimalVector) vector, schema.getFixedSize());
196197
} else {
197198
arrowType = new ArrowType.FixedSizeBinary(schema.getFixedSize());
198-
fieldType = new FieldType(nullable, arrowType, /*dictionary=*/null, getMetaData(schema));
199+
fieldType = new FieldType(nullable, arrowType, /*dictionary=*/null, getMetaData(schema, extProps));
199200
vector = createVector(consumerVector, fieldType, name, allocator);
200201
consumer = new AvroFixedConsumer((FixedSizeBinaryVector) vector, schema.getFixedSize());
201202
}
@@ -417,31 +418,34 @@ private static String getDefaultFieldName(ArrowType type) {
417418
}
418419

419420
private static Field avroSchemaToField(Schema schema, String name, AvroToArrowConfig config) {
421+
return avroSchemaToField(schema, name, config, null);
422+
}
423+
424+
private static Field avroSchemaToField(
425+
Schema schema,
426+
String name,
427+
AvroToArrowConfig config,
428+
Map<String, String> externalProps) {
429+
420430
final Type type = schema.getType();
421431
final LogicalType logicalType = schema.getLogicalType();
422-
final ArrowType arrowType;
432+
final List<Field> children = new ArrayList<>();
433+
final FieldType fieldType;
423434

424435
switch (type) {
425436
case UNION:
426-
List<Field> children = new ArrayList<>();
427437
for (int i = 0; i < schema.getTypes().size(); i++) {
428438
Schema childSchema = schema.getTypes().get(i);
429439
// Union child vector should use default name
430440
children.add(avroSchemaToField(childSchema, null, config));
431441
}
432-
arrowType = new ArrowType.Union(UnionMode.Sparse, null);
433-
if (name == null) {
434-
name = getDefaultFieldName(arrowType);
435-
}
436-
return new Field(name, FieldType.nullable(arrowType), children);
442+
fieldType = createFieldType(new ArrowType.Union(UnionMode.Sparse, null), schema, externalProps);
443+
break;
437444
case ARRAY:
438445
Schema elementSchema = schema.getElementType();
439-
arrowType = new ArrowType.List();
440-
if (name == null) {
441-
name = getDefaultFieldName(arrowType);
442-
}
443-
return new Field(name, FieldType.nullable(arrowType),
444-
Collections.singletonList(avroSchemaToField(elementSchema, elementSchema.getName(), config)));
446+
children.add(avroSchemaToField(elementSchema, elementSchema.getName(), config));
447+
fieldType = createFieldType(new ArrowType.List(), schema, externalProps);
448+
break;
445449
case MAP:
446450
// MapVector internal struct field and key field should be non-nullable
447451
FieldType keyFieldType = new FieldType(/*nullable=*/false, new ArrowType.Utf8(), /*dictionary=*/null);
@@ -450,95 +454,106 @@ private static Field avroSchemaToField(Schema schema, String name, AvroToArrowCo
450454

451455
FieldType structFieldType = new FieldType(false, new ArrowType.Struct(), /*dictionary=*/null);
452456
Field structField = new Field("internal", structFieldType, Arrays.asList(keyField, valueField));
453-
arrowType = new ArrowType.Map(/*keySorted=*/false);
454-
if (name == null) {
455-
name = getDefaultFieldName(arrowType);
456-
}
457-
return new Field(name, FieldType.nullable(arrowType), Collections.singletonList(structField));
457+
children.add(structField);
458+
fieldType = createFieldType(new ArrowType.Map(/*keySorted=*/false), schema, externalProps);
459+
break;
458460
case RECORD:
459-
List<Field> childFields = new ArrayList<>();
460461
final Set<String> skipFieldNames = config.getSkipFieldNames();
461462
for (int i = 0; i < schema.getFields().size(); i++) {
462463
final Schema.Field field = schema.getFields().get(i);
463464
Schema childSchema = field.schema();
464465
String fullChildName = String.format("%s.%s", name, field.name());
465466
if (!skipFieldNames.contains(fullChildName)) {
466-
childFields.add(avroSchemaToField(childSchema, fullChildName, config));
467+
final Map<String, String> extProps = new HashMap<>();
468+
String doc = field.doc();
469+
Set<String> aliases = field.aliases();
470+
if (doc != null) {
471+
extProps.put("doc", doc);
472+
}
473+
if (aliases != null) {
474+
extProps.put("aliases", convertAliases(aliases));
475+
}
476+
children.add(avroSchemaToField(childSchema, fullChildName, config, extProps));
467477
}
468478
}
469-
arrowType = new ArrowType.Struct();
470-
if (name == null) {
471-
name = getDefaultFieldName(arrowType);
472-
}
473-
return new Field(name, FieldType.nullable(arrowType), childFields);
479+
fieldType = createFieldType(new ArrowType.Struct(), schema, externalProps);
480+
break;
474481
case ENUM:
475482
DictionaryProvider.MapDictionaryProvider provider = config.getProvider();
476483
int current = provider.getDictionaryIds().size();
477484
int enumCount = schema.getEnumSymbols().size();
478485
ArrowType.Int indexType = DictionaryEncoder.getIndexType(enumCount);
479-
FieldType indexFieldType = new FieldType(true, indexType,
486+
487+
fieldType = createFieldType(indexType, schema, externalProps,
480488
new DictionaryEncoding(current, /*ordered=*/false, /*indexType=*/indexType));
481-
return new Field(name, indexFieldType, null);
489+
break;
482490

483491
case STRING:
484-
arrowType = new ArrowType.Utf8();
492+
fieldType = createFieldType(new ArrowType.Utf8(), schema, externalProps);
485493
break;
486494
case FIXED:
495+
final ArrowType fixedArrowType;
487496
if (logicalType instanceof LogicalTypes.Decimal) {
488-
arrowType = createDecimalArrowType((LogicalTypes.Decimal) logicalType);
497+
fixedArrowType = createDecimalArrowType((LogicalTypes.Decimal) logicalType);
489498
} else {
490-
arrowType = new ArrowType.FixedSizeBinary(schema.getFixedSize());
499+
fixedArrowType = new ArrowType.FixedSizeBinary(schema.getFixedSize());
491500
}
501+
fieldType = createFieldType(fixedArrowType, schema, externalProps);
492502
break;
493503
case INT:
504+
final ArrowType intArrowType;
494505
if (logicalType instanceof LogicalTypes.Date) {
495-
arrowType = new ArrowType.Date(DateUnit.DAY);
506+
intArrowType = new ArrowType.Date(DateUnit.DAY);
496507
} else if (logicalType instanceof LogicalTypes.TimeMillis) {
497-
arrowType = new ArrowType.Time(TimeUnit.MILLISECOND, 32);
508+
intArrowType = new ArrowType.Time(TimeUnit.MILLISECOND, 32);
498509
} else {
499-
arrowType = new ArrowType.Int(32, /*signed=*/true);
510+
intArrowType = new ArrowType.Int(32, /*signed=*/true);
500511
}
512+
fieldType = createFieldType(intArrowType, schema, externalProps);
501513
break;
502514
case BOOLEAN:
503-
arrowType = new ArrowType.Bool();
515+
fieldType = createFieldType(new ArrowType.Bool(), schema, externalProps);
504516
break;
505517
case LONG:
518+
final ArrowType longArrowType;
506519
if (logicalType instanceof LogicalTypes.TimeMicros) {
507-
arrowType = new ArrowType.Time(TimeUnit.MICROSECOND, 64);
520+
longArrowType = new ArrowType.Time(TimeUnit.MICROSECOND, 64);
508521
} else if (logicalType instanceof LogicalTypes.TimestampMillis) {
509-
arrowType = new ArrowType.Timestamp(TimeUnit.MILLISECOND, null);
522+
longArrowType = new ArrowType.Timestamp(TimeUnit.MILLISECOND, null);
510523
} else if (logicalType instanceof LogicalTypes.TimestampMicros) {
511-
arrowType = new ArrowType.Timestamp(TimeUnit.MICROSECOND, null);
524+
longArrowType = new ArrowType.Timestamp(TimeUnit.MICROSECOND, null);
512525
} else {
513-
arrowType = new ArrowType.Int(64, /*signed=*/true);
526+
longArrowType = new ArrowType.Int(64, /*signed=*/true);
514527
}
528+
fieldType = createFieldType(longArrowType, schema, externalProps);
515529
break;
516530
case FLOAT:
517-
arrowType = new ArrowType.FloatingPoint(SINGLE);
531+
fieldType = createFieldType(new ArrowType.FloatingPoint(SINGLE), schema, externalProps);
518532
break;
519533
case DOUBLE:
520-
arrowType = new ArrowType.FloatingPoint(DOUBLE);
534+
fieldType = createFieldType(new ArrowType.FloatingPoint(DOUBLE), schema, externalProps);
521535
break;
522536
case BYTES:
537+
final ArrowType bytesArrowType;
523538
if (logicalType instanceof LogicalTypes.Decimal) {
524-
arrowType = createDecimalArrowType((LogicalTypes.Decimal) logicalType);
539+
bytesArrowType = createDecimalArrowType((LogicalTypes.Decimal) logicalType);
525540
} else {
526-
arrowType = new ArrowType.Binary();
541+
bytesArrowType = new ArrowType.Binary();
527542
}
528-
543+
fieldType = createFieldType(bytesArrowType, schema, externalProps);
529544
break;
530545
case NULL:
531-
arrowType = new ArrowType.Null();
546+
fieldType = createFieldType(ArrowType.Null.INSTANCE, schema, externalProps);
532547
break;
533548
default:
534549
// no-op, shouldn't get here
535550
throw new UnsupportedOperationException();
536551
}
537552

538553
if (name == null) {
539-
name = getDefaultFieldName(arrowType);
554+
name = getDefaultFieldName(fieldType.getType());
540555
}
541-
return Field.nullable(name, arrowType);
556+
return new Field(name, fieldType, children.size() == 0 ? null : children);
542557
}
543558

544559
private static Consumer createArrayConsumer(Schema schema, String name, AvroToArrowConfig config,
@@ -568,7 +583,7 @@ private static Consumer createStructConsumer(Schema schema, String name, AvroToA
568583

569584
StructVector structVector;
570585
if (consumerVector == null) {
571-
final Field field = avroSchemaToField(schema, name, config);
586+
final Field field = avroSchemaToField(schema, name, config, createExternalProps(schema));
572587
structVector = (StructVector) field.createVector(config.getAllocator());
573588
} else {
574589
structVector = (StructVector) consumerVector;
@@ -600,7 +615,7 @@ private static Consumer createEnumConsumer(Schema schema, String name, AvroToArr
600615

601616
BaseIntVector indexVector;
602617
if (consumerVector == null) {
603-
final Field field = avroSchemaToField(schema, name, config);
618+
final Field field = avroSchemaToField(schema, name, config, createExternalProps(schema));
604619
indexVector = (BaseIntVector) field.createVector(config.getAllocator());
605620
} else {
606621
indexVector = (BaseIntVector) consumerVector;
@@ -676,12 +691,6 @@ private static Consumer createUnionConsumer(Schema schema, String name, AvroToAr
676691
return new AvroUnionsConsumer(unionVector, delegates, types);
677692
}
678693

679-
private static Map<String, String> getMetaData(Schema schema) {
680-
Map<String, String> metadata = new HashMap<>();
681-
schema.getObjectProps().forEach((k,v) -> metadata.put(k, v.toString()));
682-
return metadata;
683-
}
684-
685694
/**
686695
* Read data from {@link Decoder} and generate a {@link VectorSchemaRoot}.
687696
* @param schema avro schema
@@ -740,4 +749,54 @@ static VectorSchemaRoot avroToArrowVectors(
740749

741750
return root;
742751
}
752+
753+
private static Map<String, String> getMetaData(Schema schema) {
754+
Map<String, String> metadata = new HashMap<>();
755+
schema.getObjectProps().forEach((k,v) -> metadata.put(k, v.toString()));
756+
return metadata;
757+
}
758+
759+
private static Map<String, String> getMetaData(Schema schema, Map<String, String> externalProps) {
760+
Map<String, String> metadata = getMetaData(schema);
761+
if (externalProps != null) {
762+
metadata.putAll(externalProps);
763+
}
764+
return metadata;
765+
}
766+
767+
/**
768+
* Parse avro attributes and convert them to metadata.
769+
*/
770+
private static Map<String, String> createExternalProps(Schema schema) {
771+
final Map<String, String> extProps = new HashMap<>();
772+
String doc = schema.getDoc();
773+
Set<String> aliases = schema.getAliases();
774+
if (doc != null) {
775+
extProps.put("doc", doc);
776+
}
777+
if (aliases != null) {
778+
extProps.put("aliases", convertAliases(aliases));
779+
}
780+
return extProps;
781+
}
782+
783+
private static FieldType createFieldType(ArrowType arrowType, Schema schema, Map<String, String> externalProps) {
784+
return createFieldType(arrowType, schema, externalProps, /*dictionary=*/null);
785+
}
786+
787+
private static FieldType createFieldType(
788+
ArrowType arrowType,
789+
Schema schema,
790+
Map<String, String> externalProps,
791+
DictionaryEncoding dictionary) {
792+
793+
return new FieldType(/*nullable=*/false, arrowType, dictionary,
794+
getMetaData(schema, externalProps));
795+
}
796+
797+
private static String convertAliases(Set<String> aliases) {
798+
JsonStringArrayList jsonList = new JsonStringArrayList();
799+
aliases.stream().forEach(a -> jsonList.add(a));
800+
return jsonList.toString();
801+
}
743802
}

java/adapter/avro/src/test/java/org/apache/arrow/AvroToArrowTest.java

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,12 +25,14 @@
2525
import java.util.Arrays;
2626
import java.util.LinkedHashMap;
2727
import java.util.List;
28+
import java.util.Map;
2829

2930
import org.apache.arrow.vector.FieldVector;
3031
import org.apache.arrow.vector.VarCharVector;
3132
import org.apache.arrow.vector.VectorSchemaRoot;
3233
import org.apache.arrow.vector.complex.ListVector;
3334
import org.apache.arrow.vector.complex.MapVector;
35+
import org.apache.arrow.vector.complex.StructVector;
3436
import org.apache.avro.Schema;
3537
import org.apache.avro.generic.GenericData;
3638
import org.apache.avro.generic.GenericRecord;
@@ -80,6 +82,76 @@ public void testRecordType() throws Exception {
8082
checkRecordResult(schema, data, root);
8183
}
8284

85+
@Test
86+
public void testFixedAttributes() throws Exception {
87+
Schema schema = getSchema("attrs/test_fixed_attr.avsc");
88+
89+
List<GenericData.Fixed> data = new ArrayList<>();
90+
List<byte[]> expected = new ArrayList<>();
91+
for (int i = 0; i < 5; i++) {
92+
byte[] value = ("value" + i).getBytes(StandardCharsets.UTF_8);
93+
expected.add(value);
94+
GenericData.Fixed fixed = new GenericData.Fixed(schema);
95+
fixed.bytes(value);
96+
data.add(fixed);
97+
}
98+
99+
VectorSchemaRoot root = writeAndRead(schema, data);
100+
FieldVector vector = root.getFieldVectors().get(0);
101+
102+
Map<String, String> metadata = vector.getField().getMetadata();
103+
assertEquals("fixed doc", metadata.get("doc"));
104+
assertEquals("[\"alias1\",\"alias2\"]", metadata.get("aliases"));
105+
}
106+
107+
@Test
108+
public void testEnumAttributes() throws Exception {
109+
Schema schema = getSchema("attrs/test_enum_attrs.avsc");
110+
List<GenericData.EnumSymbol> data = Arrays.asList(
111+
new GenericData.EnumSymbol(schema, "SPADES"),
112+
new GenericData.EnumSymbol(schema, "HEARTS"),
113+
new GenericData.EnumSymbol(schema, "DIAMONDS"),
114+
new GenericData.EnumSymbol(schema, "CLUBS"),
115+
new GenericData.EnumSymbol(schema, "SPADES"));
116+
117+
VectorSchemaRoot root = writeAndRead(schema, data);
118+
FieldVector vector = root.getFieldVectors().get(0);
119+
120+
Map<String, String> metadata = vector.getField().getMetadata();
121+
assertEquals("enum doc", metadata.get("doc"));
122+
assertEquals("[\"alias1\",\"alias2\"]", metadata.get("aliases"));
123+
}
124+
125+
@Test
126+
public void testRecordAttributes() throws Exception {
127+
Schema schema = getSchema("attrs/test_record_attrs.avsc");
128+
Schema nestedSchema = schema.getFields().get(0).schema();
129+
ArrayList<GenericRecord> data = new ArrayList<>();
130+
for (int i = 0; i < 5; i++) {
131+
GenericRecord record = new GenericData.Record(schema);
132+
GenericRecord nestedRecord = new GenericData.Record(nestedSchema);
133+
nestedRecord.put(0, "test" + i);
134+
nestedRecord.put(1, i);
135+
record.put(0, nestedRecord);
136+
137+
data.add(record);
138+
}
139+
140+
VectorSchemaRoot root = writeAndRead(schema, data);
141+
142+
StructVector structVector = (StructVector) root.getFieldVectors().get(0);
143+
Map<String, String> structMeta = structVector.getField().getMetadata();
144+
Map<String, String> childMeta1 = structVector.getChildByOrdinal(0).getField().getMetadata();
145+
Map<String, String> childMeta2 = structVector.getChildByOrdinal(1).getField().getMetadata();
146+
147+
assertEquals("f0 doc", structMeta.get("doc"));
148+
assertEquals("[\"f0.a1\"]", structMeta.get("aliases"));
149+
assertEquals("f1 doc", childMeta1.get("doc"));
150+
assertEquals("[\"f1.a1\",\"f1.a2\"]", childMeta1.get("aliases"));
151+
assertEquals("f2 doc", childMeta2.get("doc"));
152+
assertEquals("[\"f2.a1\",\"f2.a2\"]", childMeta2.get("aliases"));
153+
}
154+
83155
@Test
84156
public void testNestedRecordType() throws Exception {
85157
Schema schema = getSchema("test_nested_record.avsc");

0 commit comments

Comments
 (0)