Skip to content

Commit

Permalink
Add http(s) support to the command line (#8753)
Browse files Browse the repository at this point in the history
* Add http(s) support to the command line

* fmt

* Add documentation

* Add a test

* fmt

---------

Co-authored-by: Andrew Lamb <[email protected]>
  • Loading branch information
kcolford and alamb authored Feb 4, 2024
1 parent 86a2ab0 commit 840499f
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 11 deletions.
20 changes: 10 additions & 10 deletions datafusion-cli/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion datafusion-cli/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ dirs = "4.0.0"
env_logger = "0.9"
futures = "0.3"
mimalloc = { version = "0.1", default-features = false }
object_store = { version = "0.9.0", features = ["aws", "gcp"] }
object_store = { version = "0.9.0", features = ["aws", "gcp", "http"] }
parking_lot = { version = "0.12" }
parquet = { version = "50.0.0", default-features = false }
regex = "1.8"
Expand Down
18 changes: 18 additions & 0 deletions datafusion-cli/src/exec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ use datafusion::physical_plan::{collect, execute_stream};
use datafusion::prelude::SessionContext;
use datafusion::sql::{parser::DFParser, sqlparser::dialect::dialect_from_str};

use object_store::http::HttpBuilder;
use object_store::ObjectStore;
use rustyline::error::ReadlineError;
use rustyline::Editor;
Expand Down Expand Up @@ -281,6 +282,11 @@ async fn create_external_table(
let builder = get_gcs_object_store_builder(url, cmd)?;
Arc::new(builder.build()?) as Arc<dyn ObjectStore>
}
"http" | "https" => Arc::new(
HttpBuilder::new()
.with_url(url.origin().ascii_serialization())
.build()?,
) as Arc<dyn ObjectStore>,
_ => {
// for other types, try to get from the object_store_registry
ctx.runtime_env()
Expand Down Expand Up @@ -329,12 +335,24 @@ mod tests {
return plan_err!("LogicalPlan is not a CreateExternalTable");
}

// Ensure the URL is supported by the object store
ctx.runtime_env()
.object_store(ListingTableUrl::parse(location)?)?;

Ok(())
}

/// Checks that a `CREATE EXTERNAL TABLE` statement pointing at a plain
/// `http://` location passes through `create_external_table_test` without error.
#[tokio::test]
async fn create_object_store_table_http() -> Result<()> {
    // An HTTP location is expected to be accepted.
    let location = "http://example.com/file.parquet";
    let statement = format!(
        "CREATE EXTERNAL TABLE test STORED AS PARQUET LOCATION '{location}'"
    );

    // Any failure reported by the helper is propagated as the test failure.
    create_external_table_test(location, &statement).await?;

    Ok(())
}

#[tokio::test]
async fn create_object_store_table_s3() -> Result<()> {
let access_key_id = "fake_access_key_id";
Expand Down
21 changes: 21 additions & 0 deletions docs/source/user-guide/cli.md
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,27 @@ STORED AS CSV
LOCATION '/path/to/aggregate_test_100.csv';
```
## Registering Remote Data Sources
`datafusion-cli` can read from remote locations using a variety of protocols.
For example, to read from a remote Parquet file via HTTP(S), you can use the following:
```sql
CREATE EXTERNAL TABLE hits
STORED AS PARQUET
LOCATION 'https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_1.parquet';
```
```sql
select count(*) from hits;
+----------+
| COUNT(*) |
+----------+
| 1000000 |
+----------+
1 row in set. Query took 0.344 seconds.
```
## Registering S3 Data Sources
[AWS S3](https://aws.amazon.com/s3/) data sources can be registered by executing a `CREATE EXTERNAL TABLE` SQL statement.
Expand Down

0 comments on commit 840499f

Please sign in to comment.