Skip to content

Commit 6789465

Browse files
lidavidm authored and bkietz committed
ARROW-9749: [C++][GLib][Python][R][Ruby][Dataset] Introduce FragmentScanOptions, consolidate ScanContext/ScanOptions
- ScanContext/ScanOptions have been merged, since they were essentially always passed together.
- For scan options that are specific to a scan (e.g. CSV conversion options), a new FragmentScanOptions has been added to ScanOptions. Currently only CSV has this and it only wraps csv::ConvertOptions; follow-up issues can tackle the rest.
- GLib, Python, R, and Ruby bindings have been updated.

Closes apache#9686 from lidavidm/arrow-9749

Authored-by: David Li <li.davidm96@gmail.com>
Signed-off-by: Benjamin Kietzman <bengilgit@gmail.com>
1 parent 1b7e396 commit 6789465

30 files changed

Lines changed: 238 additions & 475 deletions

c_glib/arrow-dataset-glib/scanner.cpp

Lines changed: 21 additions & 200 deletions
Original file line number | Diff line number | Diff line change
@@ -38,8 +38,6 @@ G_BEGIN_DECLS
3838
* @title: Scanner classes
3939
* @include: arrow-dataset-glib/arrow-dataset-glib.h
4040
*
41-
* #GADScanContext is a class for a scan context.
42-
*
4341
* #GADScanOptions is a class for a set of scan options.
4442
*
4543
* #GADScanTask is an abstract class for a scan task.
@@ -49,131 +47,6 @@ G_BEGIN_DECLS
4947
* Since: 1.0.0
5048
*/
5149

52-
/* arrow::dataset::ScanContext */
53-
54-
typedef struct GADScanContextPrivate_ {
55-
std::shared_ptr<arrow::dataset::ScanContext> scan_context;
56-
} GADScanContextPrivate;
57-
58-
enum {
59-
PROP_SCAN_CONTEXT = 1,
60-
PROP_USE_THREADS,
61-
};
62-
63-
G_DEFINE_TYPE_WITH_PRIVATE(GADScanContext,
64-
gad_scan_context,
65-
G_TYPE_OBJECT)
66-
67-
#define GAD_SCAN_CONTEXT_GET_PRIVATE(obj) \
68-
static_cast<GADScanContextPrivate *>( \
69-
gad_scan_context_get_instance_private( \
70-
GAD_SCAN_CONTEXT(obj)))
71-
72-
static void
73-
gad_scan_context_finalize(GObject *object)
74-
{
75-
auto priv = GAD_SCAN_CONTEXT_GET_PRIVATE(object);
76-
77-
priv->scan_context.~shared_ptr();
78-
79-
G_OBJECT_CLASS(gad_scan_context_parent_class)->finalize(object);
80-
}
81-
82-
static void
83-
gad_scan_context_set_property(GObject *object,
84-
guint prop_id,
85-
const GValue *value,
86-
GParamSpec *pspec)
87-
{
88-
auto priv = GAD_SCAN_CONTEXT_GET_PRIVATE(object);
89-
90-
switch (prop_id) {
91-
case PROP_SCAN_CONTEXT:
92-
priv->scan_context =
93-
*static_cast<std::shared_ptr<arrow::dataset::ScanContext> *>(g_value_get_pointer(value));
94-
break;
95-
case PROP_USE_THREADS:
96-
priv->scan_context->use_threads = g_value_get_boolean(value);
97-
break;
98-
default:
99-
G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
100-
break;
101-
}
102-
}
103-
104-
static void
105-
gad_scan_context_get_property(GObject *object,
106-
guint prop_id,
107-
GValue *value,
108-
GParamSpec *pspec)
109-
{
110-
auto priv = GAD_SCAN_CONTEXT_GET_PRIVATE(object);
111-
112-
switch (prop_id) {
113-
case PROP_USE_THREADS:
114-
g_value_set_boolean(value, priv->scan_context->use_threads);
115-
break;
116-
default:
117-
G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
118-
break;
119-
}
120-
}
121-
122-
static void
123-
gad_scan_context_init(GADScanContext *object)
124-
{
125-
auto priv = GAD_SCAN_CONTEXT_GET_PRIVATE(object);
126-
new(&priv->scan_context) std::shared_ptr<arrow::dataset::ScanContext>;
127-
}
128-
129-
static void
130-
gad_scan_context_class_init(GADScanContextClass *klass)
131-
{
132-
auto gobject_class = G_OBJECT_CLASS(klass);
133-
134-
gobject_class->finalize = gad_scan_context_finalize;
135-
gobject_class->set_property = gad_scan_context_set_property;
136-
gobject_class->get_property = gad_scan_context_get_property;
137-
138-
auto scan_context = arrow::dataset::ScanContext();
139-
140-
GParamSpec *spec;
141-
spec = g_param_spec_pointer("scan-context",
142-
"ScanContext",
143-
"The raw std::shared<arrow::dataset::ScanContext> *",
144-
static_cast<GParamFlags>(G_PARAM_WRITABLE |
145-
G_PARAM_CONSTRUCT_ONLY));
146-
g_object_class_install_property(gobject_class, PROP_SCAN_CONTEXT, spec);
147-
148-
/**
149-
* GADScanContext:use-threads:
150-
*
151-
* Indicate if the Scanner should make use of a ThreadPool.
152-
*
153-
* Since: 1.0.0
154-
*/
155-
spec = g_param_spec_boolean("use-threads",
156-
"Use threads",
157-
"Indicate if the Scanner should make use of a ThreadPool",
158-
scan_context.use_threads,
159-
static_cast<GParamFlags>(G_PARAM_READWRITE));
160-
g_object_class_install_property(gobject_class, PROP_USE_THREADS, spec);
161-
}
162-
163-
/**
164-
* gad_scan_context_new:
165-
*
166-
* Returns: A newly created #GADScanContext.
167-
*
168-
* Since: 1.0.0
169-
*/
170-
GADScanContext *
171-
gad_scan_context_new(void)
172-
{
173-
auto arrow_scan_context = std::make_shared<arrow::dataset::ScanContext>();
174-
return gad_scan_context_new_raw(&arrow_scan_context);
175-
}
176-
17750
/* arrow::dataset::ScanOptions */
17851

17952
typedef struct GADScanOptionsPrivate_ {
@@ -186,6 +59,7 @@ enum {
18659
PROP_EVALUATOR,
18760
PROP_PROJECTOR,
18861
PROP_BATCH_SIZE,
62+
PROP_USE_THREADS,
18963
};
19064

19165
G_DEFINE_TYPE_WITH_PRIVATE(GADScanOptions,
@@ -223,6 +97,9 @@ gad_scan_options_set_property(GObject *object,
22397
case PROP_BATCH_SIZE:
22498
priv->scan_options->batch_size = g_value_get_int64(value);
22599
break;
100+
case PROP_USE_THREADS:
101+
priv->scan_options->use_threads = g_value_get_boolean(value);
102+
break;
226103
default:
227104
G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
228105
break;
@@ -241,6 +118,9 @@ gad_scan_options_get_property(GObject *object,
241118
case PROP_BATCH_SIZE:
242119
g_value_set_int64(value, priv->scan_options->batch_size);
243120
break;
121+
case PROP_USE_THREADS:
122+
g_value_set_boolean(value, priv->scan_options->use_threads);
123+
break;
244124
default:
245125
G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
246126
break;
@@ -294,6 +174,20 @@ gad_scan_options_class_init(GADScanOptionsClass *klass)
294174
scan_options->batch_size,
295175
static_cast<GParamFlags>(G_PARAM_READWRITE));
296176
g_object_class_install_property(gobject_class, PROP_BATCH_SIZE, spec);
177+
178+
/**
179+
* GADScanOptions:use-threads:
180+
*
181+
* Indicate if the Scanner should make use of a ThreadPool.
182+
*
183+
* Since: 4.0.0
184+
*/
185+
spec = g_param_spec_boolean("use-threads",
186+
"Use threads",
187+
"Indicate if the Scanner should make use of a ThreadPool",
188+
scan_options->use_threads,
189+
static_cast<GParamFlags>(G_PARAM_READWRITE));
190+
g_object_class_install_property(gobject_class, PROP_USE_THREADS, spec);
297191
}
298192

299193
/**
@@ -334,14 +228,12 @@ gad_scan_options_get_schema(GADScanOptions *scan_options)
334228
typedef struct GADScanTaskPrivate_ {
335229
std::shared_ptr<arrow::dataset::ScanTask> scan_task;
336230
GADScanOptions *options;
337-
GADScanContext *context;
338231
GADFragment *fragment;
339232
} GADScanTaskPrivate;
340233

341234
enum {
342235
PROP_SCAN_TASK = 1,
343236
PROP_OPTIONS,
344-
PROP_CONTEXT,
345237
PROP_FRAGMENT,
346238
};
347239

@@ -364,11 +256,6 @@ gad_scan_task_dispose(GObject *object)
364256
priv->options = NULL;
365257
}
366258

367-
if (priv->context) {
368-
g_object_unref(priv->context);
369-
priv->context = NULL;
370-
}
371-
372259
if (priv->fragment) {
373260
g_object_unref(priv->fragment);
374261
priv->fragment = NULL;
@@ -403,9 +290,6 @@ gad_scan_task_set_property(GObject *object,
403290
case PROP_OPTIONS:
404291
priv->options = GAD_SCAN_OPTIONS(g_value_dup_object(value));
405292
break;
406-
case PROP_CONTEXT:
407-
priv->context = GAD_SCAN_CONTEXT(g_value_dup_object(value));
408-
break;
409293
case PROP_FRAGMENT:
410294
priv->fragment = GAD_FRAGMENT(g_value_dup_object(value));
411295
break;
@@ -427,9 +311,6 @@ gad_scan_task_get_property(GObject *object,
427311
case PROP_OPTIONS:
428312
g_value_set_object(value, priv->options);
429313
break;
430-
case PROP_CONTEXT:
431-
g_value_set_object(value, priv->context);
432-
break;
433314
case PROP_FRAGMENT:
434315
g_value_set_object(value, priv->fragment);
435316
break;
@@ -479,21 +360,6 @@ gad_scan_task_class_init(GADScanTaskClass *klass)
479360
G_PARAM_CONSTRUCT_ONLY));
480361
g_object_class_install_property(gobject_class, PROP_OPTIONS, spec);
481362

482-
/**
483-
* GADScanTask:context:
484-
*
485-
* The context of the scan task.
486-
*
487-
* Since: 1.0.0
488-
*/
489-
spec = g_param_spec_object("context",
490-
"Context",
491-
"The context of the scan task",
492-
GAD_TYPE_SCAN_CONTEXT,
493-
static_cast<GParamFlags>(G_PARAM_READWRITE |
494-
G_PARAM_CONSTRUCT_ONLY));
495-
g_object_class_install_property(gobject_class, PROP_CONTEXT, spec);
496-
497363
/**
498364
* GADScanTask:fragment:
499365
*
@@ -531,27 +397,6 @@ gad_scan_task_get_options(GADScanTask *scan_task)
531397
return gad_scan_options_new_raw(&arrow_options);
532398
}
533399

534-
/**
535-
* gad_scan_task_get_context:
536-
* @scan_task: A #GADScanTask.
537-
*
538-
* Returns: (transfer full): A #GADScanContext.
539-
*
540-
* Since: 1.0.0
541-
*/
542-
GADScanContext *
543-
gad_scan_task_get_context(GADScanTask *scan_task)
544-
{
545-
auto priv = GAD_SCAN_TASK_GET_PRIVATE(scan_task);
546-
if (priv->context) {
547-
g_object_ref(priv->context);
548-
return priv->context;
549-
}
550-
551-
auto arrow_context = priv->scan_task->context();
552-
return gad_scan_context_new_raw(&arrow_context);
553-
}
554-
555400
/**
556401
* gad_scan_task_get_fragment:
557402
* @scan_task: A #GADScanTask.
@@ -618,7 +463,6 @@ gad_in_memory_scan_task_class_init(GADInMemoryScanTaskClass *klass)
618463
* (element-type GArrowRecordBatch): The record batches of the table.
619464
* @n_record_batches: The number of record batches.
620465
* @options: A #GADScanOptions.
621-
* @context: A #GADScanContext.
622466
* @fragment: A #GADInMemoryFragment.
623467
*
624468
* Returns: A newly created #GADInMemoryScanTask.
@@ -629,7 +473,6 @@ GADInMemoryScanTask *
629473
gad_in_memory_scan_task_new(GArrowRecordBatch **record_batches,
630474
gsize n_record_batches,
631475
GADScanOptions *options,
632-
GADScanContext *context,
633476
GADInMemoryFragment *fragment)
634477
{
635478
std::vector<std::shared_ptr<arrow::RecordBatch>> arrow_record_batches;
@@ -639,38 +482,18 @@ gad_in_memory_scan_task_new(GArrowRecordBatch **record_batches,
639482
arrow_record_batches.push_back(arrow_record_batch);
640483
}
641484
auto arrow_options = gad_scan_options_get_raw(options);
642-
auto arrow_context = gad_scan_context_get_raw(context);
643485
auto arrow_fragment = gad_fragment_get_raw(GAD_FRAGMENT(fragment));
644486
auto arrow_in_memory_scan_task =
645487
std::make_shared<arrow::dataset::InMemoryScanTask>(arrow_record_batches,
646488
arrow_options,
647-
arrow_context,
648489
arrow_fragment);
649490
return gad_in_memory_scan_task_new_raw(&arrow_in_memory_scan_task,
650491
options,
651-
context,
652492
fragment);
653493
}
654494

655495
G_END_DECLS
656496

657-
GADScanContext *
658-
gad_scan_context_new_raw(std::shared_ptr<arrow::dataset::ScanContext> *arrow_scan_context)
659-
{
660-
auto scan_context =
661-
GAD_SCAN_CONTEXT(g_object_new(GAD_TYPE_SCAN_CONTEXT,
662-
"scan-context", arrow_scan_context,
663-
NULL));
664-
return scan_context;
665-
}
666-
667-
std::shared_ptr<arrow::dataset::ScanContext>
668-
gad_scan_context_get_raw(GADScanContext *scan_context)
669-
{
670-
auto priv = GAD_SCAN_CONTEXT_GET_PRIVATE(scan_context);
671-
return priv->scan_context;
672-
}
673-
674497
GADScanOptions *
675498
gad_scan_options_new_raw(std::shared_ptr<arrow::dataset::ScanOptions> *arrow_scan_options)
676499
{
@@ -691,14 +514,12 @@ gad_scan_options_get_raw(GADScanOptions *scan_options)
691514
GADInMemoryScanTask *
692515
gad_in_memory_scan_task_new_raw(std::shared_ptr<arrow::dataset::InMemoryScanTask> *arrow_in_memory_scan_task,
693516
GADScanOptions *options,
694-
GADScanContext *context,
695517
GADInMemoryFragment *fragment)
696518
{
697519
auto in_memory_scan_task =
698520
GAD_IN_MEMORY_SCAN_TASK(g_object_new(GAD_TYPE_IN_MEMORY_SCAN_TASK,
699521
"scan-task", arrow_in_memory_scan_task,
700522
"options", options,
701-
"context", context,
702523
"fragment", fragment,
703524
NULL));
704525
return in_memory_scan_task;

c_glib/arrow-dataset-glib/scanner.h

Lines changed: 0 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -25,22 +25,6 @@
2525

2626
G_BEGIN_DECLS
2727

28-
/* arrow::dataset::ScanContext */
29-
30-
#define GAD_TYPE_SCAN_CONTEXT (gad_scan_context_get_type())
31-
G_DECLARE_DERIVABLE_TYPE(GADScanContext,
32-
gad_scan_context,
33-
GAD,
34-
SCAN_CONTEXT,
35-
GObject)
36-
struct _GADScanContextClass
37-
{
38-
GObjectClass parent_class;
39-
};
40-
41-
GARROW_AVAILABLE_IN_1_0
42-
GADScanContext *gad_scan_context_new(void);
43-
4428
/* arrow::dataset::ScanOptions */
4529

4630
#define GAD_TYPE_SCAN_OPTIONS (gad_scan_options_get_type())
@@ -75,8 +59,6 @@ struct _GADScanTaskClass
7559

7660
GARROW_AVAILABLE_IN_1_0
7761
GADScanOptions *gad_scan_task_get_options(GADScanTask *scan_task);
78-
GARROW_AVAILABLE_IN_1_0
79-
GADScanContext *gad_scan_task_get_context(GADScanTask *scan_task);
8062
GARROW_AVAILABLE_IN_4_0
8163
GADFragment *gad_scan_task_get_fragment(GADScanTask *scan_task);
8264
GARROW_AVAILABLE_IN_1_0
@@ -101,7 +83,6 @@ GADInMemoryScanTask *
10183
gad_in_memory_scan_task_new(GArrowRecordBatch **record_batches,
10284
gsize n_record_batches,
10385
GADScanOptions *options,
104-
GADScanContext *context,
10586
GADInMemoryFragment *fragment);
10687

10788
G_END_DECLS

0 commit comments

Comments
 (0)