Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,124 changes: 559 additions & 565 deletions Cargo.lock

Large diffs are not rendered by default.

6 changes: 4 additions & 2 deletions columnq/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,8 @@ hyper-tls = { version = "0.5.0", default-features = false, optional = true }
hyper-rustls = { version = "0.23.2", default-features = false, optional = true }

[dependencies.deltalake]
git = "https://github.com/delta-io/delta-rs.git"
rev = "72a9e5827e99c7d2a1cf05806ffce6f0a4449d47"
default-features = false
version = "0.10.0"
features = ["datafusion-ext"]

[dependencies.connectorx]
Expand All @@ -73,18 +72,21 @@ rustls = [
"hyper-rustls",
"reqwest/rustls-tls",
"deltalake/s3",
"deltalake/azure",
"yup-oauth2/hyper-rustls",
]
native-tls-vendored = [
"reqwest/native-tls-vendored",
"hyper-tls/vendored",
"deltalake/s3",
"deltalake/azure",
"yup-oauth2/hyper-tls",
]
native-tls = [
"reqwest/native-tls",
"hyper-tls",
"deltalake/s3",
"deltalake/azure",
"yup-oauth2/hyper-tls",
]
simd = ["datafusion/simd"]
Expand Down
209 changes: 103 additions & 106 deletions columnq/src/columnq.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
use crate::io::BlobStoreType;
use std::collections::hash_map::Entry;
use std::collections::HashMap;
use std::convert::TryFrom;
Expand All @@ -7,91 +6,24 @@ use std::sync::Arc;
use datafusion::arrow;
use datafusion::arrow::array::as_string_array;
use datafusion::arrow::array::StringArray;
use datafusion::datasource::object_store::ObjectStoreRegistry;
use datafusion::error::{DataFusionError, Result as DatafusionResult};
use datafusion::error::DataFusionError;
use datafusion::error::Result as DatafusionResult;
pub use datafusion::execution::context::SessionConfig;
use datafusion::execution::context::SessionContext;
use datafusion::execution::runtime_env::{RuntimeConfig, RuntimeEnv};
use datafusion::physical_plan::collect;

use crate::error::{ColumnQError, QueryError};
use crate::query;
use crate::table::{self, KeyValueSource, TableSource};
use object_store::aws::AmazonS3Builder;
use object_store::azure::MicrosoftAzureBuilder;
use object_store::gcp::GoogleCloudStorageBuilder;
use object_store::DynObjectStore;
use object_store::ObjectStore;
use url::Url;

/// Field-less object store registry used by ColumnQ.
///
/// The type carries no state, so its `Debug` representation is just the type
/// name; deriving `Debug` yields the same output as the previous hand-written
/// `debug_struct(..).finish()` implementation.
#[derive(Debug, Default)]
pub struct ColumnQObjectStoreRegistry {}

impl ColumnQObjectStoreRegistry {
pub fn get_by_url(&self, url: &Url) -> DatafusionResult<Arc<DynObjectStore>> {
self.get_store(url)
}
}
impl ObjectStoreRegistry for ColumnQObjectStoreRegistry {
    /// No-op: the supplied store is dropped and `None` is always returned.
    fn register_store(
        &self,
        _url: &Url,
        _store: Arc<DynObjectStore>,
    ) -> Option<Arc<DynObjectStore>> {
        None
    }

    /// Builds a new object store client for `url` on every call.
    ///
    /// The URL host is used as the bucket (S3, GCS) or container (Azure) name
    /// and the URL scheme selects the backend. Each builder is seeded with
    /// `from_env()`.
    ///
    /// # Errors
    ///
    /// * `DataFusionError::Execution` when the URL has no host, or when the
    ///   scheme maps to a `BlobStoreType` that is not S3, GCS or Azure.
    /// * `DataFusionError::External` when the scheme cannot be converted to a
    ///   `BlobStoreType`, or when the backend builder fails.
    fn get_store(&self, url: &Url) -> DatafusionResult<Arc<DynObjectStore>> {
        // The host check comes first so a host-less URL reports the missing
        // bucket before the scheme is looked at.
        let bucket = url.host_str().ok_or_else(|| {
            DataFusionError::Execution(format!("Missing bucket name: {}", url.as_str()))
        })?;

        let scheme = url.scheme();
        let store_kind = BlobStoreType::try_from(scheme)
            .map_err(|err| DataFusionError::External(Box::new(err)))?;

        let store: Arc<DynObjectStore> = match store_kind {
            BlobStoreType::S3 => {
                let client = AmazonS3Builder::from_env()
                    .with_bucket_name(bucket)
                    // for minio in CI
                    .with_allow_http(true)
                    .build()
                    .map_err(|err| DataFusionError::External(Box::new(err)))?;
                Arc::new(client)
            }
            BlobStoreType::GCS => {
                let client = GoogleCloudStorageBuilder::from_env()
                    .with_bucket_name(bucket)
                    .build()
                    .map_err(|err| DataFusionError::External(Box::new(err)))?;
                Arc::new(client)
            }
            BlobStoreType::Azure => {
                let client = MicrosoftAzureBuilder::from_env()
                    .with_container_name(bucket)
                    .build()
                    .map_err(|err| DataFusionError::External(Box::new(err)))?;
                Arc::new(client)
            }
            _ => {
                return Err(DataFusionError::Execution(format!(
                    "Unsupported scheme: {scheme}"
                )))
            }
        };

        Ok(store)
    }
}
use crate::error::{ColumnQError, QueryError};
use crate::io::BlobStoreType;
use crate::query;
use crate::table::TableIoSource;
use crate::table::{self, KeyValueSource, TableSource};

pub struct ColumnQ {
pub dfctx: SessionContext,
Expand All @@ -109,9 +41,7 @@ impl ColumnQ {
}

pub fn new_with_config(config: SessionConfig) -> Self {
let object_store_registry = ColumnQObjectStoreRegistry::default();
let rn_config =
RuntimeConfig::new().with_object_store_registry(Arc::new(object_store_registry));
let rn_config = RuntimeConfig::new();
let runtime_env = RuntimeEnv::new(rn_config).unwrap();
let dfctx = SessionContext::with_config_rt(config, Arc::new(runtime_env));

Expand All @@ -124,14 +54,78 @@ impl ColumnQ {
}

/// Loads the table described by `t` and registers it under `t.name`.
///
/// For URI-backed sources that parse as a URL, an object store is
/// registered first on a best-effort basis. The table is then loaded, its
/// schema is recorded in `schema_map`, and any table already registered
/// under the same name is replaced.
///
/// # Errors
///
/// Propagates failures from `table::load`, `deregister_table` and
/// `register_table`. Failures from `register_object_storage` are ignored.
pub async fn load_table(&mut self, t: &TableSource) -> Result<(), ColumnQError> {
    if let TableIoSource::Uri(uri_str) = &t.io_source {
        if let Ok(parsed_url) = Url::parse(uri_str) {
            // Deliberately ignored: register_object_storage rejects URLs
            // without a host or with an unsupported scheme, and such
            // sources must still reach table::load below.
            let _ = self.register_object_storage(&parsed_url);
        }
    }

    let loaded = table::load(t, &self.dfctx).await?;
    let table_name = t.name.as_str();

    self.schema_map.insert(t.name.clone(), loaded.schema());
    // Drop any previous registration so the new provider takes its place.
    self.dfctx.deregister_table(table_name)?;
    self.dfctx.register_table(table_name, loaded)?;

    Ok(())
}

/// Builds an object store for `url` and registers it with the session's
/// runtime environment.
///
/// The URL scheme selects the backend (S3, GCS or Azure) and the URL host
/// is used as the bucket / container name. Each builder is seeded with
/// `from_env()`.
///
/// Returns whatever `register_object_store` returns for this URL.
///
/// # Errors
///
/// * The error from `BlobStoreType::try_from` when the scheme is not
///   recognised.
/// * `ColumnQError::InvalidUri` when the URL has no host, the scheme maps
///   to a blob type other than S3/GCS/Azure, or the backend builder fails.
pub fn register_object_storage(
    &mut self,
    url: &Url,
) -> Result<Option<Arc<dyn ObjectStore>>, ColumnQError> {
    let url_scheme = url.scheme();
    let blob_type = BlobStoreType::try_from(url_scheme)?;

    let built: DatafusionResult<Arc<DynObjectStore>> = url
        .host()
        .ok_or_else(|| DataFusionError::Execution(format!("Missing bucket name: {}", url)))
        .and_then(|host| {
            let bucket = host.to_string();
            match blob_type {
                BlobStoreType::S3 => AmazonS3Builder::from_env()
                    .with_bucket_name(bucket)
                    // for minio in CI
                    .with_allow_http(true)
                    .build()
                    .map(|s3| Arc::new(s3) as Arc<DynObjectStore>)
                    .map_err(|err| DataFusionError::External(Box::new(err))),
                BlobStoreType::GCS => GoogleCloudStorageBuilder::from_env()
                    .with_bucket_name(bucket)
                    .build()
                    .map(|gcs| Arc::new(gcs) as Arc<DynObjectStore>)
                    .map_err(|err| DataFusionError::External(Box::new(err))),
                BlobStoreType::Azure => MicrosoftAzureBuilder::from_env()
                    .with_container_name(bucket)
                    .build()
                    .map(|azure| Arc::new(azure) as Arc<DynObjectStore>)
                    .map_err(|err| DataFusionError::External(Box::new(err))),
                _ => Err(DataFusionError::Execution(format!(
                    "Unsupported scheme: {url_scheme:?}"
                ))),
            }
        });

    // Any DataFusion-level failure above is reported as an invalid URI,
    // carrying the original error's display text.
    let store = built.map_err(|e| ColumnQError::InvalidUri(e.to_string()))?;

    let runtime_env = self.dfctx.runtime_env();
    Ok(runtime_env.register_object_store(url, store))
}
pub async fn load_kv(&mut self, kv: KeyValueSource) -> Result<(), ColumnQError> {
use datafusion::arrow::datatypes::DataType;

Expand Down Expand Up @@ -233,50 +227,53 @@ impl Default for ColumnQ {

#[cfg(test)]
mod tests {

use std::fs::File;
use std::io::Write;
use std::{env, str::FromStr};
use tempfile::Builder;

use url::Url;

use super::ColumnQObjectStoreRegistry;
use crate::error::ColumnQError;
use crate::ColumnQ;

#[test]
fn s3_object_store_type() {
fn s3_object_store_type() -> Result<(), ColumnQError> {
env::set_var("AWS_REGION", "us-east-1");
let mut cq = ColumnQ::new();
let _ = cq.register_object_storage(&Url::parse("s3://bucket_name/path/foo.csv").unwrap());
let host_url = "s3://bucket_name/path";
let provider = ColumnQObjectStoreRegistry {};

let err = provider
.get_by_url(&Url::from_str(host_url).unwrap())
.unwrap_err();
assert!(err.to_string().contains("Generic S3 error: Missing region"));
let provider = &cq.dfctx.runtime_env().object_store_registry;

env::set_var("AWS_REGION", "us-east-1");
let res = provider.get_by_url(&Url::from_str(host_url).unwrap());
let res = provider.get_store(&Url::from_str(host_url).unwrap());
let msg = match res {
Err(e) => format!("{e}"),
Ok(_) => "".to_string(),
};
assert_eq!("".to_string(), msg);
env::remove_var("AWS_REGION");
Ok(())
}

#[test]
fn s3_object_store_type_no_bucket() {
env::set_var("AWS_REGION", "us-east-1");
let mut cq = ColumnQ::new();
let host_url = "s3://";
let provider = ColumnQObjectStoreRegistry {};

let err = provider
.get_by_url(&Url::from_str(host_url).unwrap())
let err = cq
.register_object_storage(&Url::parse(host_url).unwrap())
.unwrap_err();

assert!(err.to_string().contains("Missing bucket name: s3://"));
}

#[tokio::test]
async fn gcs_object_store_type() -> anyhow::Result<()> {
let mut cq = ColumnQ::new();
let host_url = "gs://bucket_name/path";
let provider = ColumnQObjectStoreRegistry {};
let _ = cq.register_object_storage(&Url::parse(host_url).unwrap());
let provider = &cq.dfctx.runtime_env().object_store_registry;

let tmp_dir = Builder::new().prefix("columnq.test.gcs").tempdir()?;
let tmp_gcs_path = tmp_dir.path().join("service_account.json");
Expand All @@ -287,7 +284,7 @@ mod tests {
)?;
env::set_var("GOOGLE_SERVICE_ACCOUNT", tmp_gcs_path);

let res = provider.get_by_url(&Url::from_str(host_url).unwrap());
let res = provider.get_store(&Url::from_str(host_url).unwrap());
let msg = match res {
Err(e) => format!("{e}"),
Ok(_) => "".to_string(),
Expand All @@ -302,13 +299,16 @@ mod tests {

#[test]
fn azure_object_store_type() {
let host_url = "az://bucket_name/path";
let provider = ColumnQObjectStoreRegistry {};
// https://docs.microsoft.com/en-us/azure/storage/common/storage-use-azurite?tabs=visual-studio#http-connection-strings
env::set_var("AZURE_STORAGE_ACCOUNT_NAME", "devstoreaccount1");
env::set_var("AZURE_STORAGE_ACCOUNT_KEY", "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==");

let res = provider.get_by_url(&Url::from_str(host_url).unwrap());
let mut cq = ColumnQ::new();
let host_url = "az://bucket_name/path";
let _ = cq.register_object_storage(&Url::parse(host_url).unwrap());
let provider = &cq.dfctx.runtime_env().object_store_registry;

let res = provider.get_store(&Url::from_str(host_url).unwrap());
let msg = match res {
Err(e) => format!("{e}"),
Ok(_) => "".to_string(),
Expand All @@ -321,13 +321,10 @@ mod tests {

#[test]
fn unknown_object_store_type() {
let unknown = "unknown://bucket_name/path";
let provider = ColumnQObjectStoreRegistry {};
let err = provider
.get_by_url(&Url::from_str(unknown).unwrap())
let mut cq = ColumnQ::new();
let err = cq
.register_object_storage(&Url::parse("unknown://bucket_name/path").unwrap())
.unwrap_err();
assert!(err
.to_string()
.contains("External error: Invalid table URI: Unsupported scheme: \"unknown\""))
assert!(err.to_string().contains("Unsupported scheme: \"unknown\""))
}
}
11 changes: 6 additions & 5 deletions columnq/src/io/object_store.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
use crate::columnq::ColumnQObjectStoreRegistry;
use crate::error::ColumnQError;
use crate::table::TableSource;
use futures::TryStreamExt;
Expand All @@ -21,17 +20,18 @@ pub async fn partition_key_to_reader(
pub async fn partitions_from_path_iterator<'a, F, T, I>(
path_iter: I,
mut partition_reader: F,
dfctx: &datafusion::execution::context::SessionContext,
) -> Result<Vec<T>, ColumnQError>
where
I: Iterator<Item = &'a str>,
F: FnMut(std::io::Cursor<Vec<u8>>) -> Result<T, ColumnQError>,
{
let object_store_provider = ColumnQObjectStoreRegistry {};
let object_store_registry = dfctx.runtime_env().object_store_registry.clone();
let mut partitions = vec![];

for path_str in path_iter {
let url = &Url::from_str(path_str).unwrap();
let client = object_store_provider.get_by_url(url)?;
let client = object_store_registry.get_store(url)?;
let path = object_store::path::Path::from(&url.path()[1..]);
let reader = partition_key_to_reader(client.clone(), &path).await?;
partitions.push(partition_reader(reader)?);
Expand All @@ -44,13 +44,14 @@ pub async fn partitions_from_uri<'a, F, T>(
t: &'a TableSource,
_uri: URIReference<'a>,
mut partition_reader: F,
dfctx: &datafusion::execution::context::SessionContext,
) -> Result<Vec<T>, ColumnQError>
where
F: FnMut(std::io::Cursor<Vec<u8>>) -> Result<T, ColumnQError>,
{
let object_store_provider = ColumnQObjectStoreRegistry {};
let object_store_registry = dfctx.runtime_env().object_store_registry.clone();
let url = &Url::from_str(t.get_uri_str()).unwrap();
let client = object_store_provider.get_by_url(url)?;
let client = object_store_registry.get_store(url)?;
let mut partitions = vec![];

// url.path starts with "/", but object_store does not expect "/" at the beginning
Expand Down
Loading