-
Notifications
You must be signed in to change notification settings - Fork 108
Expand file tree
/
Copy pathtutorial.rs
More file actions
2312 lines (2311 loc) · 82.7 KB
/
tutorial.rs
File metadata and controls
2312 lines (2311 loc) · 82.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
//! # Developer tutorial
//!
//! This tutorial is a starting point for teaching Rust developers how to
//! use DBSP in their projects.
//!
//! All of the programs in this tutorial are provided as examples under
//! `examples/tutorial`. You can run each of them with, e.g. `cargo run
//! --example tutorial1`.
//!
//! # Table of contents
//!
//! * [Introduction](#introduction)
//! * [Basics](#basics)
//! * [Input](#input)
//! * [Execution](#execution)
//! * [Computation and output](#computation-and-output)
//! * [More sophisticated computation](#more-sophisticated-computation)
//! * [Aggregation](#aggregation)
//! * [Rolling aggregation](#rolling-aggregation)
//! * [Joins](#joins)
//! * [Finding months with the most
//! vaccinations](#finding-months-with-the-most-vaccinations)
//! * [Vaccination rates](#vaccination-rates)
//! * [Incremental computation](#incremental-computation)
//! * [Fixed-point computation](#fixed-point-computation)
//! * [Next steps](#next-steps)
//!
//! # Introduction
//!
//! Computation in DBSP is a two-stage process. First, create a DBSP "circuit",
//! which defines the computation's structure, including its inputs and
//! outputs[^1]. Second, any number of times, feed in input changes, tell DBSP
//! to run the circuit, and then read out the output changes. A skeleton for a
//! DBSP program might look like this (the second and later steps could iterate
//! any number of times):
//!
//! [^1]: The term "circuit" is used because diagrams of DBSP computation
//! resemble those for electrical circuits. DBSP circuits are not necessarily
//! closed loops like electrical circuits.
//!
//! ```
//! fn main() {
//! // ...build circuit...
//! // ...feed data into circuit...
//! // ...execute circuit...
//! // ...read output from circuit...
//! }
//! ```
//!
//! The following section shows the basics of how to fill in each of these
//! steps.
//!
//! # Basics
//!
//! This section shows off the basics of input, computation, and output.
//! Afterward, we'll show how to do more sophisticated computation.
//!
//! ## Input
//!
//! To process data in DBSP, we need to get data from somewhere. The
//! `dbsp_adapters` crate in `crates/adapters` implements input and output
//! adapters for a number of formats and transports along with a server that
//! instantiates a DBSP pipeline and adapters based on a user-provided
//! declarative configuration. In this tutorial we take a different approach,
//! instantiating the pipeline and pushing data to it directly using the Rust
//! API. Specifically, we will parse some data from a CSV file and bring it
//! into a circuit.
//!
//! Let's work with the [Our World in Data](https://ourworldindata.org/)
//! public-domain dataset on COVID-19 vaccinations, which is available on
//! Github. Its main data file on vaccinations is `vaccinations.csv`, which
//! contains about 168,000 rows of data. That's a lot to stick in the DBSP
//! repo, so we've included a subset with data for just a few countries. The
//! full version of the snapshot of the data excerpted here is [freely
//! available](https://github.com/owid/covid-19-data/blob/88ab53d1081ef7651b16212658ea43bd175d572a/public/data/vaccinations/vaccinations.csv)
//! on Github.
//!
//! The vaccination data has 16 columns per row. We will only look at three of
//! those: `location`, a country name; `date`, a date in the form `yyyy-mm-dd`;
//! and `daily_vaccinations`, the number of vaccinations given on `date` in
//! `location`. The `daily_vaccinations` field is sometimes blank.
//!
//! Rust crates have good support for reading this data. We can combine the
//! `csv` crate to read CSV files with `serde` for deserializing into a `struct`
//! and `time` for parsing the date field. A full program for parsing and
//! printing the data is below and in `tutorial1.rs`:
//! ```rust
//! use anyhow::Result;
//! use csv::Reader;
//! use serde::Deserialize;
//!
//! #[allow(dead_code)]
//! #[derive(Debug, Deserialize)]
//! struct Record {
//! location: String,
//! date: i32,
//! daily_vaccinations: Option<u64>,
//! }
//!
//! fn main() -> Result<()> {
//! let path = format!(
//! "{}/examples/tutorial/vaccinations.csv",
//! env!("CARGO_MANIFEST_DIR")
//! );
//! for result in Reader::from_path(path)?.deserialize() {
//! let record: Record = result?;
//! println!("{:?}", record);
//! }
//! Ok(())
//! }
//! ```
//!
//! If we run this, then it prints the records in `Debug` format. Here are the
//! first few:
//!
//! ```text
//! Record { location: "England", date: 18637, daily_vaccinations: None }
//! Record { location: "England", date: 18638, daily_vaccinations: Some(140441) }
//! Record { location: "England", date: 18639, daily_vaccinations: Some(164043) }
//! Record { location: "England", date: 18640, daily_vaccinations: Some(192088) }
//! Record { location: "England", date: 18641, daily_vaccinations: Some(213978) }
//! ...
//! ```
//!
//! We want to create a DBSP circuit and bring this data into it. We create a
//! circuit with [`RootCircuit::build`], which creates an empty circuit, calls a
//! callback that we pass it to add input and computation and output to the
//! circuit, and then fixes the form of the circuit and returns the circuit plus
//! anything we returned from our callback. The code skeleton is like this:
//!
//! ```
//! # use anyhow::Result;
//! # use dbsp::RootCircuit;
//! fn build_circuit(circuit: &mut RootCircuit) -> Result<()> {
//! // ...populate `circuit` with operators...
//! Ok((/*handles*/))
//! }
//!
//! fn main() -> Result<()> {
//! // Build circuit.
//! let (circuit, (/*handles*/)) = RootCircuit::build(build_circuit)?;
//!
//! // ...feed data into circuit...
//! // ...execute circuit...
//! // ...read output from circuit...
//! # Ok(())
//! }
//! ```
//!
//! The natural way to bring our data into the circuit is through a "Z-set"
//! ([`ZSet`]) input stream. A ["Z-set"] is a set in which each item is
//! associated with an integer weight. In the context of changes to a data set,
//! positive weights represent insertions and negative weights represent
//! deletions. The magnitude of the weight represents a count, so that a weight
//! of 1 represents an insertion of a single copy of a record, 2 represents two
//! copies, and so on, and similarly for negative weights and deletions. Thus,
//! a Z-set represents changes to a multiset.
//!
//! ["Z-set"]: https://www.feldera.com/blog/z-sets-representing-database-changes
//!
//! We create the Z-set input stream inside `build_circuit` using
//! [`RootCircuit::add_input_zset`], which returns a [`Stream`] for further use
//! in `build_circuit` and a [`ZSetHandle`] for `main` to use to feed in
//! data. Our skeleton fills in as shown below. We're jumping the gun a bit by
//! adding a call to [`inspect`](Stream::inspect) on the `Stream`. This method
//! calls a closure on each batch of data that passes through; we're having it
//! print the total weight in our Z-set just to demonstrate that something is
//! happening:
//!
//! ```rust
//! # use anyhow::Result;
//! # use chrono::Datelike;
//! # use csv::Reader;
//! # use dbsp::utils::Tup2;
//! # use dbsp::{RootCircuit, ZSet, ZSetHandle};
//! # use rkyv::{Archive, Serialize};
//! # use size_of::SizeOf;
//! #
//! # #[derive(
//! # Clone,
//! # Default,
//! # Debug,
//! # Eq,
//! # PartialEq,
//! # Ord,
//! # PartialOrd,
//! # Hash,
//! # SizeOf,
//! # Archive,
//! # Serialize,
//! # rkyv::Deserialize,
//! # serde::Deserialize,
//! # feldera_macros::IsNone,
//! # )]
//! # #[archive_attr(derive(Ord, Eq, PartialEq, PartialOrd))]
//! # struct Record {
//! # location: String,
//! # date: i32,
//! # daily_vaccinations: Option<u64>,
//! # }
//! fn build_circuit(circuit: &mut RootCircuit) -> Result<ZSetHandle<Record>> {
//! let (input_stream, input_handle) = circuit.add_input_zset::<Record>();
//! input_stream.inspect(|records| {
//! println!("{}", records.weighted_count());
//! });
//! // ...populate `circuit` with more operators...
//! Ok(input_handle)
//! }
//! fn main() -> Result<()> {
//! // Build circuit.
//! let (circuit, input_handle) = RootCircuit::build(build_circuit)?;
//!
//! // ...feed data into circuit...
//! // ...execute circuit...
//! // ...read output from circuit...
//! # Ok(())
//! }
//! ```
//!
//! The best way to feed the records into `input_handle` is to collect them into
//! a `Vec<Tup2<Record, ZWeight>>`, where `ZWeight` (an alias for `i64`) is the
//! Z-set weight. All the weights can be 1, since we are inserting each of
//! them. We feed them in with [`ZSetHandle::append`]. So, we can fill in `//
//! ...feed data into circuit...` with:
//!
//! ```rust
//! # use anyhow::Result;
//! # use chrono::Datelike;
//! # use csv::Reader;
//! # use dbsp::utils::Tup2;
//! # use dbsp::{RootCircuit, ZSet, ZSetHandle};
//! # use dbsp::algebra::zset::ZWeight;
//! # use rkyv::{Archive, Serialize};
//! # use size_of::SizeOf;
//! #
//! # #[derive(
//! # Clone,
//! # Default,
//! # Debug,
//! # Eq,
//! # PartialEq,
//! # Ord,
//! # PartialOrd,
//! # Hash,
//! # SizeOf,
//! # Archive,
//! # Serialize,
//! # rkyv::Deserialize,
//! # serde::Deserialize,
//! # feldera_macros::IsNone,
//! # )]
//! # #[archive_attr(derive(Ord, Eq, PartialEq, PartialOrd))]
//! # struct Record {
//! # location: String,
//! # date: i32,
//! # daily_vaccinations: Option<u64>,
//! # }
//! # fn build_circuit(circuit: &mut RootCircuit) -> Result<ZSetHandle<Record>> {
//! # let (input_stream, input_handle) = circuit.add_input_zset::<Record>();
//! # input_stream.inspect(|records| {
//! # println!("{}", records.weighted_count());
//! # });
//! # // ...populate `circuit` with more operators...
//! # Ok(input_handle)
//! # }
//! #
//! # fn main() -> Result<()> {
//! # // Build circuit.
//! # let (circuit, input_handle) = RootCircuit::build(build_circuit)?;
//! #
//! // Feed data into circuit.
//! let path = format!(
//! "{}/examples/tutorial/vaccinations.csv",
//! env!("CARGO_MANIFEST_DIR")
//! );
//! let mut input_records = Reader::from_path(path)?
//! .deserialize()
//! .map(|result| result.map(|record| Tup2(record, 1)))
//! .collect::<Result<Vec<Tup2<Record, ZWeight>>, _>>()?;
//! input_handle.append(&mut input_records);
//!
//! # // Execute circuit.
//! # circuit.transaction()?;
//! #
//! # // ...read output from circuit...
//! # Ok(())
//! # }
//! ```
//!
//! > 💡 The code above uses `Tup2<Record, ZWeight>` where `(Record, ZWeight)`
//! would be the obvious type. DBSP has its own tuple-like types [Tup0],
//! [Tup1], ..., [Tup10] because Rust does not allow DBSP to [implement foreign
//! traits] on the standard tuple types.
//!
//! [implement foreign traits]: https://doc.rust-lang.org/reference/items/implementations.html#r-items.impl.trait.orphan-rule
//!
//! The compiler will point out a problem: `Record` lacks several traits
//! required for the record type of a Z-set. We need `SizeOf` from the
//! `size_of` crate and `Archive`, `Serialize`, and `Deserialize` from the
//! `rkyv` crate. We can derive all of them:
//!
//! ```
//! use rkyv::{Archive, Serialize};
//! use size_of::SizeOf;
//! use chrono::Datelike;
//! use dbsp::utils;
//!
//! #[derive(
//! Clone,
//! Default,
//! Debug,
//! Eq,
//! PartialEq,
//! Ord,
//! PartialOrd,
//! Hash,
//! SizeOf,
//! Archive,
//! Serialize,
//! rkyv::Deserialize,
//! serde::Deserialize,
//! feldera_macros::IsNone,
//! )]
//! #[archive_attr(derive(Ord, Eq, PartialEq, PartialOrd))]
//! # struct Record {
//! # location: String,
//! # date: i32,
//! # daily_vaccinations: Option<u64>,
//! # }
//! ```
//!
//! > 💡 There are two `Deserialize` traits above. DBSP requires
//! `rkyv::Deserialize` to support distributed computations, by allowing data to
//! be moved from one host to another. Our example uses `serde::Deserialize` to
//! parse CSV.
//!
//! ## Execution
//!
//! Our program now builds a circuit and feeds data into it. To execute it, we
//! just replace `// ...execute circuit...` with a call to
//! [`CircuitHandle::transaction`]:
//!
//! ```
//! # use anyhow::Result;
//! # use chrono::Datelike;
//! # use csv::Reader;
//! # use dbsp::algebra::zset::ZWeight;
//! # use dbsp::utils::Tup2;
//! # use dbsp::{RootCircuit, ZSet, ZSetHandle};
//! # use rkyv::{Archive, Serialize};
//! # use size_of::SizeOf;
//! #
//! # #[derive(
//! # Clone,
//! # Default,
//! # Debug,
//! # Eq,
//! # PartialEq,
//! # Ord,
//! # PartialOrd,
//! # Hash,
//! # SizeOf,
//! # Archive,
//! # Serialize,
//! # rkyv::Deserialize,
//! # serde::Deserialize,
//! # feldera_macros::IsNone,
//! # )]
//! # #[archive_attr(derive(Ord, Eq, PartialEq, PartialOrd))]
//! # struct Record {
//! # location: String,
//! # date: i32,
//! # daily_vaccinations: Option<u64>,
//! # }
//! # fn build_circuit(circuit: &mut RootCircuit) -> Result<ZSetHandle<Record>> {
//! # let (input_stream, input_handle) = circuit.add_input_zset::<Record>();
//! # input_stream.inspect(|records| {
//! # println!("{}", records.weighted_count());
//! # });
//! # // ...populate `circuit` with more operators...
//! # Ok(input_handle)
//! # }
//! #
//! # fn main() -> Result<()> {
//! # // Build circuit.
//! # let (circuit, input_handle) = RootCircuit::build(build_circuit)?;
//! #
//! # // Feed data into circuit.
//! # let path = format!(
//! # "{}/examples/tutorial/vaccinations.csv",
//! # env!("CARGO_MANIFEST_DIR")
//! # );
//! # let mut input_records = Reader::from_path(path)?
//! # .deserialize()
//! # .map(|result| result.map(|record| Tup2(record, 1)))
//! # .collect::<Result<Vec<Tup2<Record, ZWeight>>, _>>()?;
//! # input_handle.append(&mut input_records);
//! #
//! // Execute circuit.
//! circuit.transaction()?;
//! #
//! # // ...read output from circuit...
//! # Ok(())
//! # }
//! ```
//!
//! Now, if you run our program, with `cargo run --example tutorial2`, it prints
//! `3961`, the number of records in `vaccinations.csv`. That's because our
//! program reads an entire CSV file and feeds it as input in a single step.
//! That means that running for more steps wouldn't make a difference. That's
//! not a normal use case for DBSP but, arguably, it's a reasonable setup for a
//! tutorial.
//!
//! ## Computation and output
//!
//! We haven't done any computation inside the circuit, nor have we brought
//! output back out of the circuit yet. Let's add both of those to our
//! skeleton.
//!
//! Let's do just enough computation to demonstrate the concept. Suppose we
//! want to pick out a subset of the records. We can use [`Stream::filter`] to
//! do that. For example, we can take just the records for locations in the
//! United Kingdom:
//!
//! ```
//! # use anyhow::Result;
//! # use chrono::Datelike;
//! # use csv::Reader;
//! # use dbsp::algebra::zset::ZWeight;
//! # use dbsp::utils::Tup2;
//! # use dbsp::{OrdZSet, OutputHandle, RootCircuit, ZSet, ZSetHandle};
//! # use rkyv::{Archive, Serialize};
//! # use size_of::SizeOf;
//! #
//! # #[derive(
//! # Clone,
//! # Default,
//! # Debug,
//! # Eq,
//! # PartialEq,
//! # Ord,
//! # PartialOrd,
//! # Hash,
//! # SizeOf,
//! # Archive,
//! # Serialize,
//! # rkyv::Deserialize,
//! # serde::Deserialize,
//! # feldera_macros::IsNone,
//! # )]
//! # #[archive_attr(derive(Ord, Eq, PartialEq, PartialOrd))]
//! # struct Record {
//! # location: String,
//! # date: i32,
//! # daily_vaccinations: Option<u64>,
//! # }
//! # fn build_circuit(
//! # circuit: &mut RootCircuit,
//! # ) -> Result<(ZSetHandle<Record>, OutputHandle<OrdZSet<Record>>)> {
//! # let (input_stream, input_handle) = circuit.add_input_zset::<Record>();
//! # input_stream.inspect(|records| {
//! # println!("{}", records.weighted_count());
//! # });
//! let subset = input_stream.filter(|r| {
//! r.location == "England"
//! || r.location == "Northern Ireland"
//! || r.location == "Scotland"
//! || r.location == "Wales"
//! });
//! # Ok((input_handle, subset.output()))
//! # }
//! #
//! # fn main() -> Result<()> {
//! # // Build circuit.
//! # let (circuit, (input_handle, output_handle)) = RootCircuit::build(build_circuit)?;
//! #
//! # // Feed data into circuit.
//! # let path = format!(
//! # "{}/examples/tutorial/vaccinations.csv",
//! # env!("CARGO_MANIFEST_DIR")
//! # );
//! # let mut input_records = Reader::from_path(path)?
//! # .deserialize()
//! # .map(|result| result.map(|record| Tup2(record, 1)))
//! # .collect::<Result<Vec<Tup2<Record, ZWeight>>, _>>()?;
//! # input_handle.append(&mut input_records);
//! #
//! # // Execute circuit.
//! # circuit.transaction()?;
//! #
//! # // Read output from circuit.
//! # println!("{}", output_handle.consolidate().weighted_count());
//! #
//! # Ok(())
//! # }
//! ```
//!
//! We could call `inspect` again to print the results. Instead, let's bring
//! the results out of the computation into `main` and print them there. That's
//! just a matter of calling [`Stream::output`], which returns an [`OutputHandle`]
//! that we pass back to `main`, which can then read out the data after each step. Our
//! `build_circuit` then looks like this:
//!
//! ```
//! # use anyhow::Result;
//! # use chrono::Datelike;
//! # use csv::Reader;
//! # use dbsp::algebra::zset::ZWeight;
//! # use dbsp::utils::Tup2;
//! # use dbsp::{OrdZSet, OutputHandle, RootCircuit, ZSet, ZSetHandle};
//! # use rkyv::{Archive, Serialize};
//! # use size_of::SizeOf;
//! #
//! # #[derive(
//! # Clone,
//! # Default,
//! # Debug,
//! # Eq,
//! # PartialEq,
//! # Ord,
//! # PartialOrd,
//! # Hash,
//! # SizeOf,
//! # Archive,
//! # Serialize,
//! # rkyv::Deserialize,
//! # serde::Deserialize,
//! # feldera_macros::IsNone,
//! # )]
//! # #[archive_attr(derive(Ord, Eq, PartialEq, PartialOrd))]
//! # struct Record {
//! # location: String,
//! # date: i32,
//! # daily_vaccinations: Option<u64>,
//! # }
//! fn build_circuit(
//! circuit: &mut RootCircuit,
//! ) -> Result<(ZSetHandle<Record>, OutputHandle<OrdZSet<Record>>)> {
//! let (input_stream, input_handle) = circuit.add_input_zset::<Record>();
//! input_stream.inspect(|records| {
//! println!("{}", records.weighted_count());
//! });
//! let subset = input_stream.filter(|r| {
//! r.location == "England"
//! || r.location == "Northern Ireland"
//! || r.location == "Scotland"
//! || r.location == "Wales"
//! });
//! Ok((input_handle, subset.output()))
//! }
//! #
//! # fn main() -> Result<()> {
//! # // Build circuit.
//! # let (circuit, (input_handle, output_handle)) = RootCircuit::build(build_circuit)?;
//! #
//! # // Feed data into circuit.
//! # let path = format!(
//! # "{}/examples/tutorial/vaccinations.csv",
//! # env!("CARGO_MANIFEST_DIR")
//! # );
//! # let mut input_records = Reader::from_path(path)?
//! # .deserialize()
//! # .map(|result| result.map(|record| Tup2(record, 1)))
//! # .collect::<Result<Vec<Tup2<Record, ZWeight>>, _>>()?;
//! # input_handle.append(&mut input_records);
//! #
//! # // Execute circuit.
//! # circuit.transaction()?;
//! #
//! # // Read output from circuit.
//! # println!("{}", output_handle.consolidate().weighted_count());
//! #
//! # Ok(())
//! # }
//! ```
//!
//! Back in `main`, we need to update the call to [`RootCircuit::build`] so that
//! we save the new `output_handle`. Then, after we feed in input and execute
//! the circuit, we can read the output. For general kinds of output, it can be
//! a little tricky using `OutputHandle`, because it supports multithreaded DBSP
//! runtimes that produce one output per thread. For Z-set output, one can just
//! call its [`consolidate`](`OutputHandle::consolidate`) method, which
//! internally merges the multiple outputs if multiple threads are in use. To
//! print the number of records, we can just do the following:
//!
//! ```
//! # use anyhow::Result;
//! # use chrono::Datelike;
//! # use csv::Reader;
//! # use dbsp::algebra::zset::ZWeight;
//! # use dbsp::utils::Tup2;
//! # use dbsp::{OrdZSet, OutputHandle, RootCircuit, ZSet, ZSetHandle};
//! # use rkyv::{Archive, Serialize};
//! # use size_of::SizeOf;
//! #
//! # #[derive(
//! # Clone,
//! # Default,
//! # Debug,
//! # Eq,
//! # PartialEq,
//! # Ord,
//! # PartialOrd,
//! # Hash,
//! # SizeOf,
//! # Archive,
//! # Serialize,
//! # rkyv::Deserialize,
//! # serde::Deserialize,
//! # feldera_macros::IsNone,
//! # )]
//! # #[archive_attr(derive(Ord, Eq, PartialEq, PartialOrd))]
//! # struct Record {
//! # location: String,
//! # date: i32,
//! # daily_vaccinations: Option<u64>,
//! # }
//! # fn build_circuit(
//! # circuit: &mut RootCircuit,
//! # ) -> Result<(ZSetHandle<Record>, OutputHandle<OrdZSet<Record>>)> {
//! # let (input_stream, input_handle) = circuit.add_input_zset::<Record>();
//! # input_stream.inspect(|records| {
//! # println!("{}", records.weighted_count());
//! # });
//! # let subset = input_stream.filter(|r| {
//! # r.location == "England"
//! # || r.location == "Northern Ireland"
//! # || r.location == "Scotland"
//! # || r.location == "Wales"
//! # });
//! # Ok((input_handle, subset.output()))
//! # }
//! #
//! # fn main() -> Result<()> {
//! // Build circuit.
//! let (circuit, (input_handle, output_handle)) = RootCircuit::build(build_circuit)?;
//! #
//! # // Feed data into circuit.
//! # let path = format!(
//! # "{}/examples/tutorial/vaccinations.csv",
//! # env!("CARGO_MANIFEST_DIR")
//! # );
//! # let mut input_records = Reader::from_path(path)?
//! # .deserialize()
//! # .map(|result| result.map(|record| Tup2(record, 1)))
//! # .collect::<Result<Vec<Tup2<Record, ZWeight>>, _>>()?;
//! # input_handle.append(&mut input_records);
//! #
//! # // Execute circuit.
//! # circuit.transaction()?;
//! #
//! // ...unchanged code to feed data into circuit and execute circuit...
//!
//! // Read output from circuit.
//! println!("{}", output_handle.consolidate().weighted_count());
//! #
//! # Ok(())
//! # }
//! ```
//!
//! Now, if we run it, it prints `3961`, as before, followed by `3083`. The
//! latter is from the `println!` in `main` and shows that we did select a
//! subset of the 3,961 total records.
//!
//! The full program is in `tutorial3.rs`.
//!
//! # More sophisticated computation
//!
//! Our program only does trivial computation, but DBSP supports much more
//! sophistication. Let's look at some of what it can do.
//!
//! ## Aggregation
//!
//! 3,083 records is a lot. There are so many because we've got years of daily
//! data. Let's aggregate daily vaccinations into months, to get monthly
//! vaccinations. DBSP has several forms of aggregation. All of them work with
//! "indexed Z-sets" ([`IndexedZSet`]), which are Z-sets of key-value pairs,
//! that is, they associate key-value pairs with weights. Aggregation happens
//! across records with the same key.
//!
//! We will do the equivalent of the following SQL:
//!
//! ```text
//! SELECT SUM(daily_vaccinations) FROM vaccinations GROUP BY location, year, month;
//! ```
//!
//! where `year` and `month` are derived from `date`.
//!
//! To aggregate daily vaccinations over months by location, we need to
//! transform our Z-set into an indexed Z-set where the key (the index) has the
//! form `(location, year, month)` and the value is daily vaccinations (we could
//! keep the whole record but we'd just throw away most of it later).
//! To do this, we call [`Stream::map_index`], passing in a function that maps
//! a record into a key-value tuple:
//!
//! ```ignore
//! let monthly_totals = subset
//! .map_index(|r| {
//! let date = chrono::NaiveDate::from_epoch_days(r.date).unwrap();
//! (
//! Tup3(r.location.clone(), date.year(), date.month() as u8),
//! r.daily_vaccinations.unwrap_or(0),
//! )
//! })
//! ```
//!
//! We need to clone the location because it is a `String` that the records
//! incorporate by value.
//!
//! Then we can call [`Stream::aggregate_linear`], the simplest form of
//! aggregation in DBSP, to sum across months. This function sums the output of
//! a function. To get monthly vaccinations, we just sum the values from our
//! indexed Z-set (we have to convert to `ZWeight` because aggregation
//! implicitly multiplies by record weights):
//!
//! ```ignore
//! .aggregate_linear(|v| *v as ZWeight);
//! ```
//!
//! We output the indexed Z-set as before, and then in `main` print it record by
//! record:
//!
//! ```
//! # use anyhow::Result;
//! # use chrono::Datelike;
//! # use csv::Reader;
//! # use dbsp::utils::{Tup2, Tup3};
//! # use dbsp::{OrdIndexedZSet, OutputHandle, RootCircuit, ZSetHandle, ZWeight, IndexedZSetReader};
//! # use rkyv::{Archive, Serialize};
//! # use size_of::SizeOf;
//! #
//! # #[derive(
//! # Clone,
//! # Default,
//! # Debug,
//! # Eq,
//! # PartialEq,
//! # Ord,
//! # PartialOrd,
//! # Hash,
//! # SizeOf,
//! # Archive,
//! # Serialize,
//! # rkyv::Deserialize,
//! # serde::Deserialize,
//! # feldera_macros::IsNone,
//! # )]
//! # #[archive_attr(derive(Ord, Eq, PartialEq, PartialOrd))]
//! # struct Record {
//! # location: String,
//! # date: i32,
//! # daily_vaccinations: Option<u64>,
//! # }
//! #
//! fn build_circuit(
//! circuit: &mut RootCircuit,
//! ) -> Result<(
//! ZSetHandle<Record>,
//! OutputHandle<OrdIndexedZSet<Tup3<String, i32, u8>, ZWeight>>,
//! )> {
//! # let (input_stream, input_handle) = circuit.add_input_zset::<Record>();
//! # let subset = input_stream.filter(|r| {
//! # r.location == "England"
//! # || r.location == "Northern Ireland"
//! # || r.location == "Scotland"
//! # || r.location == "Wales"
//! # });
//! # let monthly_totals = subset
//! # .map_index(|r| {
//! # let date = chrono::NaiveDate::from_epoch_days(r.date).unwrap();
//! # (
//! # Tup3(r.location.clone(), date.year(), date.month() as u8),
//! # r.daily_vaccinations.unwrap_or(0),
//! # )
//! # })
//! # .aggregate_linear(|v| *v as ZWeight);
//! // ...
//! Ok((input_handle, monthly_totals.output()))
//! }
//!
//! fn main() -> Result<()> {
//! # let (circuit, (input_handle, output_handle)) = RootCircuit::build(build_circuit)?;
//! #
//! # let path = format!(
//! # "{}/examples/tutorial/vaccinations.csv",
//! # env!("CARGO_MANIFEST_DIR")
//! # );
//! # let mut input_records = Reader::from_path(path)?
//! # .deserialize()
//! # .map(|result| result.map(|record| Tup2(record, 1)))
//! # .collect::<Result<Vec<Tup2<Record, ZWeight>>, _>>()?;
//! # input_handle.append(&mut input_records);
//! #
//! # circuit.transaction()?;
//! #
//! // ...
//! output_handle
//! .consolidate()
//! .iter()
//! .for_each(|(Tup3(l, y, m), sum, w)| println!("{l:16} {y}-{m:02} {sum:10}: {w:+}"));
//!
//! Ok(())
//! }
//! ```
//!
//! The output looks like the following. The `+1`s are the Z-set weights. They
//! show that each record represents an insertion of a single row:
//!
//! ```text
//! England 2021-01 5600174: +1
//! England 2021-02 9377418: +1
//! England 2021-03 11861175: +1
//! England 2021-04 11288945: +1
//! England 2021-05 13772946: +1
//! England 2021-06 10944915: +1
//! ...
//! Northern Ireland 2021-01 150315: +1
//! Northern Ireland 2021-02 317074: +1
//! ...
//! Wales 2023-01 33838: +1
//! Wales 2023-02 17098: +1
//! Wales 2023-03 8776: +1
//! ```
//!
//! The full program is in `tutorial4.rs`.
//!
//! ### Rolling aggregation
//!
//! By using a "moving average" to average recent data,
//! we can obtain a dataset with less noise due to variation from month
//! to month. DBSP
//! provides [`Stream::partitioned_rolling_average`] for this purpose. To
//! use it, we have to index our Z-set by time. DBSP uses the
//! time component, which must have an unsigned integer type, to
//! define the window:
//!
//! ```ignore
//! let moving_averages = monthly_totals
//! .map_index(|(Tup3(l, y, m), v)| (*y as u32 * 12 + (*m as u32 - 1), Tup2(l.clone(), *v)))
//! ```
//!
//! Once we've done that, computing the moving average is easy. Here's how we
//! get the average of the current month and the two preceding months (when
//! they're in the data set):
//!
//! ```ignore
//! .partitioned_rolling_average(
//! |Tup2(l, v)| (l.clone(), *v),
//! RelRange::new(RelOffset::Before(2), RelOffset::Before(0)))
//! ```
//!
//! As the name of the function suggests, `partitioned_rolling_average`
//! computes a rolling average within a partition. In this case, we
//! partition the data by country. The first argument of the function
//! is a closure that splits the value component of the input indexed
//! Z-set into a partition key and a value.
//! [`partitioned_rolling_average`](`Stream::partitioned_rolling_average`)
//! returns a partitioned indexed Z-set ([`OrdPartitionedIndexedZSet`]).
//! This is just an indexed Z-set in which the key is the "partition"
//! within which averaging occurs (for us, this is the country), and the
//! value is a tuple of a "timestamp" and a value. Note that the value
//! type has an `Option` wrapped around it. In our case, for example, the
//! input value type was `i64`, so the output value type is `Option<i64>`.
//! The output for a given row is `None` if there are no rows in the window,
//! which can only happen if the range passed in does not include the 0 relative
//! offset (i.e. the current row). Ours does include 0, so `None` will never
//! occur in our output.
//!
//! Let's re-map to recover year and month from the timestamp that we made and
//! to strip off the `Option`:
//!
//! ```ignore
//! .map_index(|(l, Tup2(date, avg))| (Tup3(l.clone(), date / 12, date % 12 + 1), avg.unwrap()));
//! ```
//!
//! If we adjust the `build_circuit` return type and return value, as shown
//! below, the existing code in `main` will print the moving averages just fine.
//!
//! ```
//! # use anyhow::Result;
//! # use chrono::Datelike;
//! # use csv::Reader;
//! # use dbsp::{
//! # operator::time_series::{RelOffset, RelRange},
//! # utils::{Tup2, Tup3},
//! # OrdIndexedZSet, OutputHandle, RootCircuit, ZSetHandle, ZWeight, IndexedZSetReader
//! # };
//! # use rkyv::{Archive, Serialize};
//! # use size_of::SizeOf;
//! #
//! # #[derive(
//! # Clone,
//! # Default,
//! # Debug,
//! # Eq,
//! # PartialEq,
//! # Ord,
//! # PartialOrd,
//! # Hash,
//! # SizeOf,
//! # Archive,
//! # Serialize,
//! # rkyv::Deserialize,
//! # serde::Deserialize,
//! # feldera_macros::IsNone,
//! # )]
//! # #[archive_attr(derive(Ord, Eq, PartialEq, PartialOrd))]
//! # struct Record {
//! # location: String,
//! # date: i32,
//! # daily_vaccinations: Option<u64>,
//! # }
//! #
//! fn build_circuit(
//! circuit: &mut RootCircuit,
//! ) -> Result<(
//! ZSetHandle<Record>,
//! OutputHandle<OrdIndexedZSet<Tup3<String, u32, u32>, ZWeight>>,
//! )> {
//! # let (input_stream, input_handle) = circuit.add_input_zset::<Record>();
//! # let subset = input_stream.filter(|r| {
//! # r.location == "England"
//! # || r.location == "Northern Ireland"
//! # || r.location == "Scotland"
//! # || r.location == "Wales"
//! # });
//! # let monthly_totals = subset
//! # .map_index(|r| {
//! # let date = chrono::NaiveDate::from_epoch_days(r.date).unwrap();
//! # (
//! # Tup3(r.location.clone(), date.year(), date.month() as u8),
//! # r.daily_vaccinations.unwrap_or(0),
//! # )
//! # })
//! # .aggregate_linear(|v| *v as ZWeight);
//! # let moving_averages = monthly_totals
//! # .map_index(|(Tup3(l, y, m), v)| (*y as u32 * 12 + (*m as u32 - 1), Tup2(l.clone(), *v)))
//! # .partitioned_rolling_average(
//! # |Tup2(l, v)| (l.clone(), *v),
//! # RelRange::new(RelOffset::Before(2), RelOffset::Before(0)))
//! # .map_index(|(l, Tup2(date, avg))| {
//! # (Tup3(l.clone(), date / 12, date % 12 + 1), avg.unwrap())
//! # });
//! // ...
//! Ok((input_handle, moving_averages.output()))
//! }
//! #
//! # fn main() -> Result<()> {
//! # let (circuit, (input_handle, output_handle)) = RootCircuit::build(build_circuit)?;
//! #
//! # let path = format!(
//! # "{}/examples/tutorial/vaccinations.csv",
//! # env!("CARGO_MANIFEST_DIR")
//! # );
//! # let mut input_records = Reader::from_path(path)?
//! # .deserialize()
//! # .map(|result| result.map(|record| Tup2(record, 1)))
//! # .collect::<Result<Vec<Tup2<Record, ZWeight>>, _>>()?;
//! # input_handle.append(&mut input_records);
//! #
//! # circuit.transaction()?;
//! #
//! # output_handle
//! # .consolidate()
//! # .iter()
//! # .for_each(|(Tup3(l, y, m), sum, w)| println!("{l:16} {y}-{m:02} {sum:10}: {w:+}"));
//! #
//! # Ok(())
//! # }
//! ```
//!
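//! The output looks like this. You can verify it against the previous output:
//! within each location, the second row is the average of the first two
//! monthly totals, the third row is the average of the first three, and every
//! later row averages its own month with the two preceding months (any
//! fractional part is discarded):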
//!
//! ```text
//! England 2021-01 5600174: +1
//! England 2021-02 7488796: +1
//! England 2021-03 8946255: +1
//! England 2021-04 10842512: +1
//! England 2021-05 12307688: +1
//! England 2021-06 12002268: +1
//! ...
//! Northern Ireland 2021-01 150315: +1
//! Northern Ireland 2021-02 233694: +1
//! ...
//! Wales 2021-01 295057: +1
//! Wales 2021-02 458273: +1
//! Wales 2021-03 584463: +1
//! ```
//!
//! The whole program is in `tutorial5.rs`.
//!
//! ### Joins
//!
//! Suppose we want both the current month's vaccination count and the moving
//! average together. With enough work, we could get them with just
//! aggregation by writing our own "aggregator" ([`Aggregator`]). It's a
//! little easier to do a join, and it gives us a chance to show how to do that.
//! Both our monthly vaccination counts and our moving averages are indexed