From e28fe0690bcc7c83ba8643e61088b85505c8ca4c Mon Sep 17 00:00:00 2001 From: kould Date: Mon, 18 Aug 2025 16:15:09 +0800 Subject: [PATCH 01/20] feat: `infer_schema` expands csv and ndjson support --- Cargo.lock | 2 + Cargo.toml | 2 + src/query/catalog/src/table_args.rs | 6 ++ src/query/service/Cargo.toml | 2 + .../infer_schema/infer_schema_table.rs | 6 +- .../src/table_functions/infer_schema/mod.rs | 2 +- .../infer_schema/{parquet.rs => source.rs} | 90 +++++++++++++++++-- .../infer_schema/table_args.rs | 7 ++ tests/data/csv/max_records.csv | 11 +++ tests/data/csv/mixed.csv | 4 + tests/data/csv/numbers_with_headers.csv | 19 ++++ tests/data/csv/ragged.csv | 5 ++ tests/data/ndjson/max_records.ndjson | 10 +++ tests/data/ndjson/mixed.ndjson | 3 + tests/data/ndjson/numbers.ndjson | 3 + tests/data/ndjson/ragged.ndjson | 4 + .../stage/formats/parquet/infer_schema.test | 57 ++++++++++++ 17 files changed, 221 insertions(+), 12 deletions(-) rename src/query/service/src/table_functions/infer_schema/{parquet.rs => source.rs} (68%) create mode 100644 tests/data/csv/max_records.csv create mode 100644 tests/data/csv/mixed.csv create mode 100644 tests/data/csv/numbers_with_headers.csv create mode 100644 tests/data/csv/ragged.csv create mode 100644 tests/data/ndjson/max_records.ndjson create mode 100644 tests/data/ndjson/mixed.ndjson create mode 100644 tests/data/ndjson/numbers.ndjson create mode 100644 tests/data/ndjson/ragged.ndjson diff --git a/Cargo.lock b/Cargo.lock index e39580a258d30..e282242eb03ce 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5159,8 +5159,10 @@ dependencies = [ "arrow-array", "arrow-buffer", "arrow-cast", + "arrow-csv", "arrow-flight", "arrow-ipc", + "arrow-json", "arrow-schema", "arrow-select", "arrow-udf-runtime", diff --git a/Cargo.toml b/Cargo.toml index a7036e01a4101..54a1ed641fc68 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -231,8 +231,10 @@ arrow = { version = "55" } arrow-array = { version = "55" } arrow-buffer = { version = "55" } arrow-cast = { version = "55", features = ["prettyprint"] } +arrow-csv = { version = "55" } arrow-data = { version = "55" } arrow-flight = { version = "55", features = ["flight-sql-experimental", "tls"] } +arrow-json = { version = "55" } arrow-ipc = { version = "55", features = ["lz4", "zstd"] } arrow-ord = { version = "55" } arrow-schema = { version = "55", features = ["serde"] } diff --git a/src/query/catalog/src/table_args.rs b/src/query/catalog/src/table_args.rs index dab8c366e9bad..4f1c26e1596f6 100644 --- a/src/query/catalog/src/table_args.rs +++ b/src/query/catalog/src/table_args.rs @@ -119,6 +119,12 @@ pub fn bool_value(value: &Scalar) -> Result { } } +pub fn i64_value(value: &Scalar) -> Result { + value.get_i64().ok_or_else(|| { + ErrorCode::BadArguments(format!("invalid value {value} expect to be i64 literal.")) + }) +} + pub fn string_literal(val: &str) -> Scalar { Scalar::String(val.to_string()) } diff --git a/src/query/service/Cargo.toml b/src/query/service/Cargo.toml index 0016398385d49..22e0b000d2376 100644 --- a/src/query/service/Cargo.toml +++ b/src/query/service/Cargo.toml @@ -23,7 +23,9 @@ io-uring = [ anyhow = { workspace = true } arrow-array = { workspace = true } arrow-buffer = { workspace = true } +arrow-csv = { workspace = true } arrow-flight = { workspace = true } +arrow-json = { workspace = true } arrow-ipc = { workspace = true } arrow-schema = { workspace = true } arrow-select = { workspace = true } diff --git a/src/query/service/src/table_functions/infer_schema/infer_schema_table.rs 
b/src/query/service/src/table_functions/infer_schema/infer_schema_table.rs index b9bb841f4281d..89f37eaf1aa7e 100644 --- a/src/query/service/src/table_functions/infer_schema/infer_schema_table.rs +++ b/src/query/service/src/table_functions/infer_schema/infer_schema_table.rs @@ -32,7 +32,7 @@ use databend_common_meta_app::schema::TableInfo; use databend_common_meta_app::schema::TableMeta; use databend_common_pipeline_core::Pipeline; -use super::parquet::ParquetInferSchemaSource; +use super::source::InferSchemaSource; use crate::sessions::TableContext; use crate::table_functions::infer_schema::table_args::InferSchemaArgsParsed; use crate::table_functions::TableFunction; @@ -114,9 +114,7 @@ impl Table for InferSchemaTable { _put_cache: bool, ) -> Result<()> { pipeline.add_source( - |output| { - ParquetInferSchemaSource::create(ctx.clone(), output, self.args_parsed.clone()) - }, + |output| InferSchemaSource::create(ctx.clone(), output, self.args_parsed.clone()), 1, )?; Ok(()) diff --git a/src/query/service/src/table_functions/infer_schema/mod.rs b/src/query/service/src/table_functions/infer_schema/mod.rs index 804499bf8fa56..7bc1731b442b4 100644 --- a/src/query/service/src/table_functions/infer_schema/mod.rs +++ b/src/query/service/src/table_functions/infer_schema/mod.rs @@ -13,7 +13,7 @@ // limitations under the License. mod infer_schema_table; -mod parquet; +mod source; mod table_args; pub use infer_schema_table::InferSchemaTable; diff --git a/src/query/service/src/table_functions/infer_schema/parquet.rs b/src/query/service/src/table_functions/infer_schema/source.rs similarity index 68% rename from src/query/service/src/table_functions/infer_schema/parquet.rs rename to src/query/service/src/table_functions/infer_schema/source.rs index 753971deab5b7..574c07bbf2322 100644 --- a/src/query/service/src/table_functions/infer_schema/parquet.rs +++ b/src/query/service/src/table_functions/infer_schema/source.rs @@ -13,8 +13,12 @@ // limitations under the License. 
use std::collections::BTreeMap; +use std::io::Cursor; use std::sync::Arc; +use arrow_csv::reader::Format; +use arrow_json::reader::infer_json_schema; +use arrow_schema::Schema as ArrowSchema; use databend_common_ast::ast::FileLocation; use databend_common_ast::ast::UriLocation; use databend_common_catalog::table_context::TableContext; @@ -26,7 +30,8 @@ use databend_common_expression::types::UInt64Type; use databend_common_expression::DataBlock; use databend_common_expression::FromData; use databend_common_expression::TableSchema; -use databend_common_meta_app::principal::StageFileFormatType; +use databend_common_meta_app::principal::CsvFileFormatParams; +use databend_common_meta_app::principal::FileFormatParams; use databend_common_meta_app::principal::StageType; use databend_common_pipeline_core::processors::OutputPort; use databend_common_pipeline_core::processors::ProcessorPtr; @@ -37,24 +42,25 @@ use databend_common_storage::init_stage_operator; use databend_common_storage::read_parquet_schema_async_rs; use databend_common_storage::StageFilesInfo; use databend_common_users::Object; +use opendal::Operator; use opendal::Scheme; use crate::table_functions::infer_schema::infer_schema_table::INFER_SCHEMA; use crate::table_functions::infer_schema::table_args::InferSchemaArgsParsed; -pub(crate) struct ParquetInferSchemaSource { +pub(crate) struct InferSchemaSource { is_finished: bool, ctx: Arc, args_parsed: InferSchemaArgsParsed, } -impl ParquetInferSchemaSource { +impl InferSchemaSource { pub fn create( ctx: Arc, output: Arc, args_parsed: InferSchemaArgsParsed, ) -> Result { - AsyncSourcer::create(ctx.clone(), output, ParquetInferSchemaSource { + AsyncSourcer::create(ctx.clone(), output, InferSchemaSource { is_finished: false, ctx, args_parsed, @@ -63,7 +69,7 @@ impl ParquetInferSchemaSource { } #[async_trait::async_trait] -impl AsyncSource for ParquetInferSchemaSource { +impl AsyncSource for InferSchemaSource { const NAME: &'static str = INFER_SCHEMA; #[async_backtrace::framed] @@ -127,9 +133,9 @@ impl AsyncSource for ParquetInferSchemaSource { Some(f) => self.ctx.get_file_format(f).await?, None => stage_info.file_format_params.clone(), }; - let schema = match (first_file.as_ref(), file_format_params.get_type()) { + let schema = match (first_file.as_ref(), file_format_params) { (None, _) => return Ok(None), - (Some(first_file), StageFileFormatType::Parquet) => { + (Some(first_file), FileFormatParams::Parquet(_)) => { let arrow_schema = read_parquet_schema_async_rs( &operator, &first_file.path, @@ -138,6 +144,27 @@ impl AsyncSource for ParquetInferSchemaSource { .await?; TableSchema::try_from(&arrow_schema)? } + (Some(first_file), FileFormatParams::Csv(params)) => { + let arrow_schema = read_csv_metadata_async( + &first_file.path, + &operator, + Some(first_file.size), + self.args_parsed.max_records, + ¶ms, + ) + .await?; + TableSchema::try_from(&arrow_schema)? + } + (Some(first_file), FileFormatParams::NdJson(_)) => { + let arrow_schema = read_json_metadata_async( + &first_file.path, + &operator, + Some(first_file.size), + self.args_parsed.max_records, + ) + .await?; + TableSchema::try_from(&arrow_schema)? 
+ } _ => { return Err(ErrorCode::BadArguments( "infer_schema is currently limited to format Parquet", @@ -168,3 +195,52 @@ impl AsyncSource for ParquetInferSchemaSource { Ok(Some(block)) } } + +pub async fn read_csv_metadata_async( + path: &str, + operator: &Operator, + file_size: Option, + max_records: Option, + params: &CsvFileFormatParams, +) -> Result { + let file_size = match file_size { + None => operator.stat(path).await?.content_length(), + Some(n) => n, + }; + let escape = if params.escape.is_empty() { + None + } else { + Some(params.escape.as_bytes()[0]) + }; + + // TODO: It would be better if it could be read in the form of Read trait + let buf = operator.read_with(path).range(..file_size).await?.to_vec(); + let mut format = Format::default() + .with_delimiter(params.field_delimiter.as_bytes()[0]) + .with_quote(params.quote.as_bytes()[0]) + .with_header(params.headers != 0); + + if let Some(escape) = escape { + format = format.with_escape(escape); + } + let (schema, _) = format.infer_schema(Cursor::new(&buf), max_records)?; + + Ok(schema) +} + +pub async fn read_json_metadata_async( + path: &str, + operator: &Operator, + file_size: Option, + max_records: Option, +) -> Result { + let file_size = match file_size { + None => operator.stat(path).await?.content_length(), + Some(n) => n, + }; + // TODO: It would be better if it could be read in the form of Read trait + let buf = operator.read_with(path).range(..file_size).await?.to_vec(); + let (schema, _) = infer_json_schema(Cursor::new(&buf), max_records)?; + + Ok(schema) +} diff --git a/src/query/service/src/table_functions/infer_schema/table_args.rs b/src/query/service/src/table_functions/infer_schema/table_args.rs index 07d359d5985a5..4bbf0ef113713 100644 --- a/src/query/service/src/table_functions/infer_schema/table_args.rs +++ b/src/query/service/src/table_functions/infer_schema/table_args.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use databend_common_catalog::table_args::i64_value; use databend_common_catalog::table_args::TableArgs; use databend_common_exception::ErrorCode; use databend_common_exception::Result; @@ -24,6 +25,7 @@ pub(crate) struct InferSchemaArgsParsed { pub(crate) connection_name: Option, pub(crate) file_format: Option, pub(crate) files_info: StageFilesInfo, + pub(crate) max_records: Option, } impl InferSchemaArgsParsed { @@ -38,6 +40,7 @@ impl InferSchemaArgsParsed { files: None, pattern: None, }; + let mut max_records = None; for (k, v) in &args { match k.to_lowercase().as_str() { @@ -53,6 +56,9 @@ impl InferSchemaArgsParsed { "file_format" => { file_format = Some(string_value(v)?); } + "max_records_pre_file" => { + max_records = Some(i64_value(v)? 
as usize); + } _ => { return Err(ErrorCode::BadArguments(format!( "unknown param {} for infer_schema", @@ -70,6 +76,7 @@ impl InferSchemaArgsParsed { connection_name, file_format, files_info, + max_records, }) } } diff --git a/tests/data/csv/max_records.csv b/tests/data/csv/max_records.csv new file mode 100644 index 0000000000000..5e52f31e5dd5d --- /dev/null +++ b/tests/data/csv/max_records.csv @@ -0,0 +1,11 @@ +id,value +1,100 +2,200 +3,300 +4,400 +5,500 +6,foo +7,bar +8,baz +9,qux +10,quux diff --git a/tests/data/csv/mixed.csv b/tests/data/csv/mixed.csv new file mode 100644 index 0000000000000..203cdde68ced0 --- /dev/null +++ b/tests/data/csv/mixed.csv @@ -0,0 +1,4 @@ +id,name,score,active +1,Alice,88.5,true +2,Bob,92.0,false +3,Charlie,,true diff --git a/tests/data/csv/numbers_with_headers.csv b/tests/data/csv/numbers_with_headers.csv new file mode 100644 index 0000000000000..85e74e0d15564 --- /dev/null +++ b/tests/data/csv/numbers_with_headers.csv @@ -0,0 +1,19 @@ +id,value +0,1 +1,2 +2,3 +3,4 +4,5 +5,6 +6,7 +7,8 +8,9 +9,10 +10,11 +11,12 +12,13 +13,14 +14,15 +15,16 +16,17 +17,18 diff --git a/tests/data/csv/ragged.csv b/tests/data/csv/ragged.csv new file mode 100644 index 0000000000000..c0cdce65d93c2 --- /dev/null +++ b/tests/data/csv/ragged.csv @@ -0,0 +1,5 @@ +id,value,comment +1,10,ok +2,20 +3,30,missing one field +4 diff --git a/tests/data/ndjson/max_records.ndjson b/tests/data/ndjson/max_records.ndjson new file mode 100644 index 0000000000000..079f2c82061f1 --- /dev/null +++ b/tests/data/ndjson/max_records.ndjson @@ -0,0 +1,10 @@ +{"id": 1, "value": 100} +{"id": 2, "value": 200} +{"id": 3, "value": 300} +{"id": 4, "value": 400} +{"id": 5, "value": 500} +{"id": 6, "value": "foo"} +{"id": 7, "value": "bar"} +{"id": 8, "value": "baz"} +{"id": 9, "value": "qux"} +{"id": 10, "value": "quux"} diff --git a/tests/data/ndjson/mixed.ndjson b/tests/data/ndjson/mixed.ndjson new file mode 100644 index 0000000000000..f9c139d2f5175 --- /dev/null +++ b/tests/data/ndjson/mixed.ndjson @@ -0,0 +1,3 @@ +{"id": 1, "name": "Alice", "score": 88.5, "active": true} +{"id": 2, "name": "Bob", "score": 92.0, "active": false} +{"id": 3, "name": "Charlie", "score": null, "active": true} diff --git a/tests/data/ndjson/numbers.ndjson b/tests/data/ndjson/numbers.ndjson new file mode 100644 index 0000000000000..aecddc3762d07 --- /dev/null +++ b/tests/data/ndjson/numbers.ndjson @@ -0,0 +1,3 @@ +{"id": 1, "value": 100} +{"id": 2, "value": 200} +{"id": 3, "value": 300} diff --git a/tests/data/ndjson/ragged.ndjson b/tests/data/ndjson/ragged.ndjson new file mode 100644 index 0000000000000..847a327073c2c --- /dev/null +++ b/tests/data/ndjson/ragged.ndjson @@ -0,0 +1,4 @@ +{"id": 1, "value": 10, "comment": "ok"} +{"id": 2, "value": 20} +{"id": 3, "value": 30, "comment": "missing one field"} +{"id": 4} diff --git a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test index 7304db2b5d09d..4502b9c088426 100644 --- a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test +++ b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test @@ -66,3 +66,60 @@ select * from INFER_SCHEMA(location => 's3://testbucket/data/parquet/tuple.parqu ---- id INT 0 0 t TUPLE(A INT32, B STRING) 0 1 + +# CSV +statement ok +create or replace file format head_csv_format type = 'CSV' field_delimiter = ',' skip_header = 1; + +query TTBI +select * from infer_schema(location => '@data/csv/numbers_with_headers.csv', file_format => 'CSV'); +---- 
+column_1 VARCHAR 1 0 +column_2 VARCHAR 1 1 + +query TTBI +select * from infer_schema(location => '@data/csv/numbers_with_headers.csv', file_format => 'head_csv_format'); +---- +id BIGINT 1 0 +value BIGINT 1 1 + +statement error +select * from infer_schema(location => '@data/csv/ragged.csv', file_format => 'head_csv_format'); + +query TTBI +select * from infer_schema(location => '@data/csv/max_records.csv', file_format => 'head_csv_format'); +---- +id BIGINT 1 0 +value VARCHAR 1 1 + +query TTBI +select * from infer_schema(location => '@data/csv/max_records.csv', file_format => 'head_csv_format', max_records_pre_file => 5); +---- +id BIGINT 1 0 +value BIGINT 1 1 + +# NDJSON +query TTBI +select * from infer_schema(location => '@data/ndjson/numbers.ndjson', file_format => 'NDJSON'); +---- +id BIGINT 1 0 +value BIGINT 1 1 + +query TTBI +select * from infer_schema(location => '@data/ndjson/ragged.ndjson', file_format => 'NDJSON'); +---- +id BIGINT 1 0 +value BIGINT 1 1 +comment VARCHAR 1 2 + +query TTBI +select * from infer_schema(location => '@data/ndjson/max_records.ndjson', file_format => 'NDJSON'); +---- +id BIGINT 1 0 +value VARCHAR 1 1 + +query TTBI +select * from infer_schema(location => '@data/ndjson/max_records.ndjson', file_format => 'NDJSON', max_records_pre_file => 5); +---- +id BIGINT 1 0 +value BIGINT 1 1 From f39edd1e30ce80e54001c1de499063486abc8ad2 Mon Sep 17 00:00:00 2001 From: kould Date: Mon, 18 Aug 2025 16:31:15 +0800 Subject: [PATCH 02/20] chore: codefmt --- Cargo.toml | 2 +- src/query/service/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 54a1ed641fc68..19b0f0017a71b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -234,8 +234,8 @@ arrow-cast = { version = "55", features = ["prettyprint"] } arrow-csv = { version = "55" } arrow-data = { version = "55" } arrow-flight = { version = "55", features = ["flight-sql-experimental", "tls"] } -arrow-json = { version = "55" } arrow-ipc = { version = "55", features = ["lz4", "zstd"] } +arrow-json = { version = "55" } arrow-ord = { version = "55" } arrow-schema = { version = "55", features = ["serde"] } arrow-select = { version = "55" } diff --git a/src/query/service/Cargo.toml b/src/query/service/Cargo.toml index 22e0b000d2376..2bc06f9d0973f 100644 --- a/src/query/service/Cargo.toml +++ b/src/query/service/Cargo.toml @@ -25,8 +25,8 @@ arrow-array = { workspace = true } arrow-buffer = { workspace = true } arrow-csv = { workspace = true } arrow-flight = { workspace = true } -arrow-json = { workspace = true } arrow-ipc = { workspace = true } +arrow-json = { workspace = true } arrow-schema = { workspace = true } arrow-select = { workspace = true } arrow-udf-runtime = { workspace = true } From cc4d62eca6c3e059532699b54d02b0b445adab3a Mon Sep 17 00:00:00 2001 From: kould Date: Tue, 19 Aug 2025 12:57:06 +0800 Subject: [PATCH 03/20] chore: add check on csv and ndjson compression --- .../table_functions/infer_schema/source.rs | 24 +++++++++++++------ 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/src/query/service/src/table_functions/infer_schema/source.rs b/src/query/service/src/table_functions/infer_schema/source.rs index 574c07bbf2322..8f1fb795530cb 100644 --- a/src/query/service/src/table_functions/infer_schema/source.rs +++ b/src/query/service/src/table_functions/infer_schema/source.rs @@ -13,7 +13,6 @@ // limitations under the License. 
use std::collections::BTreeMap; -use std::io::Cursor; use std::sync::Arc; use arrow_csv::reader::Format; @@ -32,6 +31,7 @@ use databend_common_expression::FromData; use databend_common_expression::TableSchema; use databend_common_meta_app::principal::CsvFileFormatParams; use databend_common_meta_app::principal::FileFormatParams; +use databend_common_meta_app::principal::StageFileCompression; use databend_common_meta_app::principal::StageType; use databend_common_pipeline_core::processors::OutputPort; use databend_common_pipeline_core::processors::ProcessorPtr; @@ -145,6 +145,11 @@ impl AsyncSource for InferSchemaSource { TableSchema::try_from(&arrow_schema)? } (Some(first_file), FileFormatParams::Csv(params)) => { + if params.compression != StageFileCompression::None { + return Err(ErrorCode::InvalidCompressionData( + "Compressed CSV files are not supported", + )); + } let arrow_schema = read_csv_metadata_async( &first_file.path, &operator, @@ -155,7 +160,12 @@ impl AsyncSource for InferSchemaSource { .await?; TableSchema::try_from(&arrow_schema)? } - (Some(first_file), FileFormatParams::NdJson(_)) => { + (Some(first_file), FileFormatParams::NdJson(params)) => { + if params.compression != StageFileCompression::None { + return Err(ErrorCode::InvalidCompressionData( + "Compressed NDJSON files are not supported", + )); + } let arrow_schema = read_json_metadata_async( &first_file.path, &operator, @@ -167,7 +177,7 @@ impl AsyncSource for InferSchemaSource { } _ => { return Err(ErrorCode::BadArguments( - "infer_schema is currently limited to format Parquet", + "infer_schema is currently limited to format Parquet, CSV and NDJSON", )); } }; @@ -214,7 +224,7 @@ pub async fn read_csv_metadata_async( }; // TODO: It would be better if it could be read in the form of Read trait - let buf = operator.read_with(path).range(..file_size).await?.to_vec(); + let buf = operator.read_with(path).range(..file_size).await?; let mut format = Format::default() .with_delimiter(params.field_delimiter.as_bytes()[0]) .with_quote(params.quote.as_bytes()[0]) @@ -223,7 +233,7 @@ pub async fn read_csv_metadata_async( if let Some(escape) = escape { format = format.with_escape(escape); } - let (schema, _) = format.infer_schema(Cursor::new(&buf), max_records)?; + let (schema, _) = format.infer_schema(buf, max_records)?; Ok(schema) } @@ -239,8 +249,8 @@ pub async fn read_json_metadata_async( Some(n) => n, }; // TODO: It would be better if it could be read in the form of Read trait - let buf = operator.read_with(path).range(..file_size).await?.to_vec(); - let (schema, _) = infer_json_schema(Cursor::new(&buf), max_records)?; + let buf = operator.read_with(path).range(..file_size).await?; + let (schema, _) = infer_json_schema(buf, max_records)?; Ok(schema) } From 1481aa69ca0132a72874c982a497286893f9a712 Mon Sep 17 00:00:00 2001 From: kould Date: Tue, 19 Aug 2025 17:52:12 +0800 Subject: [PATCH 04/20] chore: add `max_bytes` --- .../table_functions/infer_schema/source.rs | 15 ++++++++---- .../infer_schema/table_args.rs | 6 +++++ .../stage/formats/parquet/infer_schema.test | 24 +++++++++++++++++++ 3 files changed, 41 insertions(+), 4 deletions(-) diff --git a/src/query/service/src/table_functions/infer_schema/source.rs b/src/query/service/src/table_functions/infer_schema/source.rs index 8f1fb795530cb..b0ad8663803aa 100644 --- a/src/query/service/src/table_functions/infer_schema/source.rs +++ b/src/query/service/src/table_functions/infer_schema/source.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing 
permissions and // limitations under the License. +use std::cmp; use std::collections::BTreeMap; use std::sync::Arc; @@ -48,6 +49,8 @@ use opendal::Scheme; use crate::table_functions::infer_schema::infer_schema_table::INFER_SCHEMA; use crate::table_functions::infer_schema::table_args::InferSchemaArgsParsed; +const DEFAULT_MAX_BYTES: u64 = 1 * 1024 * 1024; + pub(crate) struct InferSchemaSource { is_finished: bool, ctx: Arc, @@ -154,6 +157,7 @@ impl AsyncSource for InferSchemaSource { &first_file.path, &operator, Some(first_file.size), + self.args_parsed.max_bytes, self.args_parsed.max_records, ¶ms, ) @@ -170,6 +174,7 @@ impl AsyncSource for InferSchemaSource { &first_file.path, &operator, Some(first_file.size), + self.args_parsed.max_bytes, self.args_parsed.max_records, ) .await?; @@ -210,6 +215,7 @@ pub async fn read_csv_metadata_async( path: &str, operator: &Operator, file_size: Option, + max_bytes: Option, max_records: Option, params: &CsvFileFormatParams, ) -> Result { @@ -223,8 +229,8 @@ pub async fn read_csv_metadata_async( Some(params.escape.as_bytes()[0]) }; - // TODO: It would be better if it could be read in the form of Read trait - let buf = operator.read_with(path).range(..file_size).await?; + let bytes_len = cmp::min(max_bytes.unwrap_or(DEFAULT_MAX_BYTES), file_size); + let buf = operator.read_with(path).range(..bytes_len).await?; let mut format = Format::default() .with_delimiter(params.field_delimiter.as_bytes()[0]) .with_quote(params.quote.as_bytes()[0]) @@ -242,14 +248,15 @@ pub async fn read_json_metadata_async( path: &str, operator: &Operator, file_size: Option, + max_bytes: Option, max_records: Option, ) -> Result { let file_size = match file_size { None => operator.stat(path).await?.content_length(), Some(n) => n, }; - // TODO: It would be better if it could be read in the form of Read trait - let buf = operator.read_with(path).range(..file_size).await?; + let bytes_len = cmp::min(max_bytes.unwrap_or(DEFAULT_MAX_BYTES), file_size); + let buf = operator.read_with(path).range(..bytes_len).await?; let (schema, _) = infer_json_schema(buf, max_records)?; Ok(schema) diff --git a/src/query/service/src/table_functions/infer_schema/table_args.rs b/src/query/service/src/table_functions/infer_schema/table_args.rs index 4bbf0ef113713..902807c75b580 100644 --- a/src/query/service/src/table_functions/infer_schema/table_args.rs +++ b/src/query/service/src/table_functions/infer_schema/table_args.rs @@ -26,6 +26,7 @@ pub(crate) struct InferSchemaArgsParsed { pub(crate) file_format: Option, pub(crate) files_info: StageFilesInfo, pub(crate) max_records: Option, + pub(crate) max_bytes: Option, } impl InferSchemaArgsParsed { @@ -41,6 +42,7 @@ impl InferSchemaArgsParsed { pattern: None, }; let mut max_records = None; + let mut max_bytes = None; for (k, v) in &args { match k.to_lowercase().as_str() { @@ -59,6 +61,9 @@ impl InferSchemaArgsParsed { "max_records_pre_file" => { max_records = Some(i64_value(v)? as usize); } + "max_bytes" => { + max_bytes = Some(i64_value(v)? 
as u64); + } _ => { return Err(ErrorCode::BadArguments(format!( "unknown param {} for infer_schema", @@ -77,6 +82,7 @@ impl InferSchemaArgsParsed { file_format, files_info, max_records, + max_bytes, }) } } diff --git a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test index 4502b9c088426..5ddb5f48152b7 100644 --- a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test +++ b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test @@ -98,6 +98,18 @@ select * from infer_schema(location => '@data/csv/max_records.csv', file_format id BIGINT 1 0 value BIGINT 1 1 +# max_records.csv is 71 bytes +# enough bytes +query TTBI +select * from infer_schema(location => '@data/csv/max_records.csv', file_format => 'head_csv_format', max_records_pre_file => 5, max_bytes => 15); +---- +id BIGINT 1 0 +value BIGINT 1 1 + +# not enough bytes +statement error +select * from infer_schema(location => '@data/csv/max_records.csv', file_format => 'head_csv_format', max_records_pre_file => 5, max_bytes => 10); + # NDJSON query TTBI select * from infer_schema(location => '@data/ndjson/numbers.ndjson', file_format => 'NDJSON'); @@ -123,3 +135,15 @@ select * from infer_schema(location => '@data/ndjson/max_records.ndjson', file_f ---- id BIGINT 1 0 value BIGINT 1 1 + +# max_records.csv is 252 bytes +# enough bytes +query TTBI +select * from infer_schema(location => '@data/ndjson/max_records.ndjson', file_format => 'NDJSON', max_records_pre_file => 5, max_bytes => 130); +---- +id BIGINT 1 0 +value BIGINT 1 1 + +# not enough bytes +statement error +select * from infer_schema(location => '@data/ndjson/max_records.ndjson', file_format => 'NDJSON', max_records_pre_file => 5, max_bytes => 50); From 54ed2086e2ee4ac8925572bcf2f21623b9bd6c13 Mon Sep 17 00:00:00 2001 From: kould Date: Wed, 20 Aug 2025 11:57:04 +0800 Subject: [PATCH 05/20] feat: support compressed files for infer_schema csv ndjson --- Cargo.lock | 1 + src/query/service/Cargo.toml | 1 + .../table_functions/infer_schema/source.rs | 38 +++++++++++------- tests/data/csv/max_records.csv.zst | Bin 0 -> 76 bytes tests/data/csv/max_records.zip | Bin 0 -> 271 bytes tests/data/ndjson/max_records.ndjson.zst | Bin 0 -> 110 bytes tests/data/ndjson/max_records.zip | Bin 0 -> 302 bytes .../stage/formats/parquet/infer_schema.test | 24 +++++++++++ 8 files changed, 49 insertions(+), 15 deletions(-) create mode 100644 tests/data/csv/max_records.csv.zst create mode 100644 tests/data/csv/max_records.zip create mode 100644 tests/data/ndjson/max_records.ndjson.zst create mode 100644 tests/data/ndjson/max_records.zip diff --git a/Cargo.lock b/Cargo.lock index e282242eb03ce..d72ee7aca19a3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5193,6 +5193,7 @@ dependencies = [ "databend-common-catalog", "databend-common-cloud-control", "databend-common-column", + "databend-common-compress", "databend-common-config", "databend-common-exception", "databend-common-expression", diff --git a/src/query/service/Cargo.toml b/src/query/service/Cargo.toml index 2bc06f9d0973f..f6b0302b314ea 100644 --- a/src/query/service/Cargo.toml +++ b/src/query/service/Cargo.toml @@ -56,6 +56,7 @@ databend-common-cache = { workspace = true } databend-common-catalog = { workspace = true } databend-common-cloud-control = { workspace = true } databend-common-column = { workspace = true } +databend-common-compress = { workspace = true } databend-common-config = { workspace = true } databend-common-exception = { workspace 
= true } databend-common-expression = { workspace = true } diff --git a/src/query/service/src/table_functions/infer_schema/source.rs b/src/query/service/src/table_functions/infer_schema/source.rs index b0ad8663803aa..9868930da2ad1 100644 --- a/src/query/service/src/table_functions/infer_schema/source.rs +++ b/src/query/service/src/table_functions/infer_schema/source.rs @@ -14,6 +14,7 @@ use std::cmp; use std::collections::BTreeMap; +use std::io::Cursor; use std::sync::Arc; use arrow_csv::reader::Format; @@ -22,6 +23,8 @@ use arrow_schema::Schema as ArrowSchema; use databend_common_ast::ast::FileLocation; use databend_common_ast::ast::UriLocation; use databend_common_catalog::table_context::TableContext; +use databend_common_compress::CompressAlgorithm; +use databend_common_compress::DecompressDecoder; use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::types::BooleanType; @@ -32,7 +35,6 @@ use databend_common_expression::FromData; use databend_common_expression::TableSchema; use databend_common_meta_app::principal::CsvFileFormatParams; use databend_common_meta_app::principal::FileFormatParams; -use databend_common_meta_app::principal::StageFileCompression; use databend_common_meta_app::principal::StageType; use databend_common_pipeline_core::processors::OutputPort; use databend_common_pipeline_core::processors::ProcessorPtr; @@ -148,11 +150,6 @@ impl AsyncSource for InferSchemaSource { TableSchema::try_from(&arrow_schema)? } (Some(first_file), FileFormatParams::Csv(params)) => { - if params.compression != StageFileCompression::None { - return Err(ErrorCode::InvalidCompressionData( - "Compressed CSV files are not supported", - )); - } let arrow_schema = read_csv_metadata_async( &first_file.path, &operator, @@ -165,11 +162,6 @@ impl AsyncSource for InferSchemaSource { TableSchema::try_from(&arrow_schema)? } (Some(first_file), FileFormatParams::NdJson(params)) => { - if params.compression != StageFileCompression::None { - return Err(ErrorCode::InvalidCompressionData( - "Compressed NDJSON files are not supported", - )); - } let arrow_schema = read_json_metadata_async( &first_file.path, &operator, @@ -230,7 +222,15 @@ pub async fn read_csv_metadata_async( }; let bytes_len = cmp::min(max_bytes.unwrap_or(DEFAULT_MAX_BYTES), file_size); - let buf = operator.read_with(path).range(..bytes_len).await?; + let mut buf = operator.read_with(path).range(..bytes_len).await?.to_vec(); + + if let Some(algo) = CompressAlgorithm::from_path(path) { + buf = if CompressAlgorithm::Zip == algo { + DecompressDecoder::decompress_all_zip(&buf)? + } else { + DecompressDecoder::new(algo).decompress_batch(&buf)? 
+ }; + } let mut format = Format::default() .with_delimiter(params.field_delimiter.as_bytes()[0]) .with_quote(params.quote.as_bytes()[0]) @@ -239,7 +239,7 @@ pub async fn read_csv_metadata_async( if let Some(escape) = escape { format = format.with_escape(escape); } - let (schema, _) = format.infer_schema(buf, max_records)?; + let (schema, _) = format.infer_schema(Cursor::new(buf), max_records)?; Ok(schema) } @@ -256,8 +256,16 @@ pub async fn read_json_metadata_async( Some(n) => n, }; let bytes_len = cmp::min(max_bytes.unwrap_or(DEFAULT_MAX_BYTES), file_size); - let buf = operator.read_with(path).range(..bytes_len).await?; - let (schema, _) = infer_json_schema(buf, max_records)?; + let mut buf = operator.read_with(path).range(..bytes_len).await?.to_vec(); + + if let Some(algo) = CompressAlgorithm::from_path(path) { + buf = if CompressAlgorithm::Zip == algo { + DecompressDecoder::decompress_all_zip(&buf)? + } else { + DecompressDecoder::new(algo).decompress_batch(&buf)? + }; + } + let (schema, _) = infer_json_schema(Cursor::new(buf), max_records)?; Ok(schema) } diff --git a/tests/data/csv/max_records.csv.zst b/tests/data/csv/max_records.csv.zst new file mode 100644 index 0000000000000000000000000000000000000000..ef35edae5da5e34bef543852e4758960dd95b64d GIT binary patch literal 76 zcmV-S0JHxnwJ-f7NBsc+a>Nc5pg9Lb2+(R|Kj;4dYdP`kvFU%l7Kxx@;s^kPwi0zWnk5JT literal 0 HcmV?d00001 diff --git a/tests/data/csv/max_records.zip b/tests/data/csv/max_records.zip new file mode 100644 index 0000000000000000000000000000000000000000..baea0be135d7f49854fec8335da2e6fd7f5a6a43 GIT binary patch literal 271 zcmWIWW@Zs#-~d9SV3BABC~ybT{0s^Vxrr6=MXAa8MJdI4$;D-%A-oLi3PwvaV7Rn` zn}Lz#1v3K!m^kfoQPca3rteduD~2Wp4~#AuGEF*Wb# literal 0 HcmV?d00001 diff --git a/tests/data/ndjson/max_records.ndjson.zst b/tests/data/ndjson/max_records.ndjson.zst new file mode 100644 index 0000000000000000000000000000000000000000..77821a433bd6ffc639e16213f27f76bd13c023b1 GIT binary patch literal 110 zcmV-!0FnPFwJ-f7{0##DbOU=LX=EZgATcZ;B6eYHb!9LxeF`!%Gcz Q14e+kJCnFomeSvl>ovS2$^ZZW literal 0 HcmV?d00001 diff --git a/tests/data/ndjson/max_records.zip b/tests/data/ndjson/max_records.zip new file mode 100644 index 0000000000000000000000000000000000000000..02da2fa12d2064ef2b8a433cda8806c76ebd8446 GIT binary patch literal 302 zcmWIWW@Zs#-~d9SV3BABDEI@Ug%}hVauX}!i&B&Gi&Bd9@=~&j^YcPOcp2CgjFx7= zaA^fM10%}|W(Ec@u{!LOPvE8?gDZgnTHa?geOWj1@?`B?rTECr1B8x&k($RNAX)}S zX&#TfGJpQ`Dv&HrnYrYMQcDl(Dj!j&V?vS3Hf&*1OwMS~Vx7j!5a7+uQMKrXa0Jk0 qAlC(WGct)VBV3Lw2XZ+KY-t3sNcTs8H!B-RCnFFh0O=hd4g&x_E>wL0 literal 0 HcmV?d00001 diff --git a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test index 5ddb5f48152b7..e3a1a99748098 100644 --- a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test +++ b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test @@ -98,6 +98,18 @@ select * from infer_schema(location => '@data/csv/max_records.csv', file_format id BIGINT 1 0 value BIGINT 1 1 +query TTBI +select * from infer_schema(location => '@data/csv/max_records.zip', file_format => 'head_csv_format', max_records_pre_file => 5); +---- +id BIGINT 1 0 +value BIGINT 1 1 + +query TTBI +select * from infer_schema(location => '@data/csv/max_records.csv.zst', file_format => 'head_csv_format', max_records_pre_file => 5); +---- +id BIGINT 1 0 +value BIGINT 1 1 + # max_records.csv is 71 bytes # enough bytes query TTBI @@ -136,6 +148,18 @@ select * from infer_schema(location => 
'@data/ndjson/max_records.ndjson', file_f id BIGINT 1 0 value BIGINT 1 1 +query TTBI +select * from infer_schema(location => '@data/ndjson/max_records.zip', file_format => 'NDJSON', max_records_pre_file => 5); +---- +id BIGINT 1 0 +value BIGINT 1 1 + +query TTBI +select * from infer_schema(location => '@data/ndjson/max_records.ndjson.zst', file_format => 'NDJSON', max_records_pre_file => 5); +---- +id BIGINT 1 0 +value BIGINT 1 1 + # max_records.csv is 252 bytes # enough bytes query TTBI From 7075934392e6201bc7f2e99ed6997898527bb7b8 Mon Sep 17 00:00:00 2001 From: kould Date: Wed, 20 Aug 2025 13:32:15 +0800 Subject: [PATCH 06/20] chore: add xz on `infer_schema.test` --- tests/data/csv/max_records.csv.xz | Bin 0 -> 124 bytes tests/data/ndjson/max_records.ndjson.xz | Bin 0 -> 144 bytes .../stage/formats/parquet/infer_schema.test | 12 ++++++++++++ 3 files changed, 12 insertions(+) create mode 100644 tests/data/csv/max_records.csv.xz create mode 100644 tests/data/ndjson/max_records.ndjson.xz diff --git a/tests/data/csv/max_records.csv.xz b/tests/data/csv/max_records.csv.xz new file mode 100644 index 0000000000000000000000000000000000000000..25a16f4f85295057ef5da488e59b641fef51112c GIT binary patch literal 124 zcmexsUKJ6=z`*kC+7>q^21Q0O1_p)_{ill`Ft{<;#xj`9WNc$Bdg{8m_}X{(nI;S( z*Nnuv7b!o8-kEZ=sdP_;jZ|l=^a7!mOk!O8C--kJlRrOG!=rANLiXB2KMnwmV(_&I c=`7>YTF<~3>CW2jzF2q^21Q0O1_p)_{ill`F#Klljb*Uy=u~T*99&;2z*eBul+>NN za*p`vFW;vs23BSno>#lz&y$*y>v^Woo%`_Y7j1C{X7~3tUz4fcc4F!Qw!|&_+46VP wU-%&!5jyo~;}@XG3=FS*QzLU!J8BsiGygCG`DFpuZf-o1{+kIT$r2d_06+LSfdBvi literal 0 HcmV?d00001 diff --git a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test index e3a1a99748098..4ed7b1dc50ef2 100644 --- a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test +++ b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test @@ -118,6 +118,12 @@ select * from infer_schema(location => '@data/csv/max_records.csv', file_format id BIGINT 1 0 value BIGINT 1 1 +query TTBI +select * from infer_schema(location => '@data/csv/max_records.csv.xz', file_format => 'head_csv_format', max_records_pre_file => 5, max_bytes => 70); +---- +id BIGINT 1 0 +value BIGINT 1 1 + # not enough bytes statement error select * from infer_schema(location => '@data/csv/max_records.csv', file_format => 'head_csv_format', max_records_pre_file => 5, max_bytes => 10); @@ -168,6 +174,12 @@ select * from infer_schema(location => '@data/ndjson/max_records.ndjson', file_f id BIGINT 1 0 value BIGINT 1 1 +query TTBI +select * from infer_schema(location => '@data/ndjson/max_records.ndjson.xz', file_format => 'NDJSON', max_records_pre_file => 5, max_bytes => 80) +---- +id BIGINT 1 0 +value BIGINT 1 1 + # not enough bytes statement error select * from infer_schema(location => '@data/ndjson/max_records.ndjson', file_format => 'NDJSON', max_records_pre_file => 5, max_bytes => 50); From 7ef9f88cbb15c542c46347cfca2bc1a0be93d7f2 Mon Sep 17 00:00:00 2001 From: kould Date: Wed, 20 Aug 2025 13:35:38 +0800 Subject: [PATCH 07/20] chore: codefmt --- src/query/service/src/table_functions/infer_schema/source.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/query/service/src/table_functions/infer_schema/source.rs b/src/query/service/src/table_functions/infer_schema/source.rs index 9868930da2ad1..aa0bf0096a521 100644 --- a/src/query/service/src/table_functions/infer_schema/source.rs +++ b/src/query/service/src/table_functions/infer_schema/source.rs @@ -51,7 
+51,7 @@ use opendal::Scheme; use crate::table_functions::infer_schema::infer_schema_table::INFER_SCHEMA; use crate::table_functions::infer_schema::table_args::InferSchemaArgsParsed; -const DEFAULT_MAX_BYTES: u64 = 1 * 1024 * 1024; +const DEFAULT_MAX_BYTES: u64 = 1024 * 1024; pub(crate) struct InferSchemaSource { is_finished: bool, @@ -161,7 +161,7 @@ impl AsyncSource for InferSchemaSource { .await?; TableSchema::try_from(&arrow_schema)? } - (Some(first_file), FileFormatParams::NdJson(params)) => { + (Some(first_file), FileFormatParams::NdJson(_)) => { let arrow_schema = read_json_metadata_async( &first_file.path, &operator, From 69dbbd3d015cdb38d589f2b3a9f3b4d50937cf31 Mon Sep 17 00:00:00 2001 From: kould Date: Wed, 20 Aug 2025 16:59:54 +0800 Subject: [PATCH 08/20] feat(infer_schema): remove max_bytes and automatically infer the length when max_records is present --- .../table_functions/infer_schema/source.rs | 114 +++++++++--------- .../infer_schema/table_args.rs | 6 - .../stage/formats/parquet/infer_schema.test | 16 +-- 3 files changed, 64 insertions(+), 72 deletions(-) diff --git a/src/query/service/src/table_functions/infer_schema/source.rs b/src/query/service/src/table_functions/infer_schema/source.rs index aa0bf0096a521..b4d20c22386da 100644 --- a/src/query/service/src/table_functions/infer_schema/source.rs +++ b/src/query/service/src/table_functions/infer_schema/source.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::borrow::Cow; use std::cmp; use std::collections::BTreeMap; use std::io::Cursor; @@ -19,7 +20,9 @@ use std::sync::Arc; use arrow_csv::reader::Format; use arrow_json::reader::infer_json_schema; +use arrow_schema::ArrowError; use arrow_schema::Schema as ArrowSchema; +use bytes::BufMut; use databend_common_ast::ast::FileLocation; use databend_common_ast::ast::UriLocation; use databend_common_catalog::table_context::TableContext; @@ -33,7 +36,6 @@ use databend_common_expression::types::UInt64Type; use databend_common_expression::DataBlock; use databend_common_expression::FromData; use databend_common_expression::TableSchema; -use databend_common_meta_app::principal::CsvFileFormatParams; use databend_common_meta_app::principal::FileFormatParams; use databend_common_meta_app::principal::StageType; use databend_common_pipeline_core::processors::OutputPort; @@ -51,7 +53,7 @@ use opendal::Scheme; use crate::table_functions::infer_schema::infer_schema_table::INFER_SCHEMA; use crate::table_functions::infer_schema::table_args::InferSchemaArgsParsed; -const DEFAULT_MAX_BYTES: u64 = 1024 * 1024; +const DEFAULT_BYTES: u64 = 20; pub(crate) struct InferSchemaSource { is_finished: bool, @@ -150,24 +152,37 @@ impl AsyncSource for InferSchemaSource { TableSchema::try_from(&arrow_schema)? 
} (Some(first_file), FileFormatParams::Csv(params)) => { - let arrow_schema = read_csv_metadata_async( + let escape = if params.escape.is_empty() { + None + } else { + Some(params.escape.as_bytes()[0]) + }; + + let mut format = Format::default() + .with_delimiter(params.field_delimiter.as_bytes()[0]) + .with_quote(params.quote.as_bytes()[0]) + .with_header(params.headers != 0); + if let Some(escape) = escape { + format = format.with_escape(escape); + } + + let arrow_schema = read_metadata_async( &first_file.path, &operator, Some(first_file.size), - self.args_parsed.max_bytes, self.args_parsed.max_records, - ¶ms, + |reader, max_record| format.infer_schema(reader, max_record), ) .await?; TableSchema::try_from(&arrow_schema)? } (Some(first_file), FileFormatParams::NdJson(_)) => { - let arrow_schema = read_json_metadata_async( + let arrow_schema = read_metadata_async( &first_file.path, &operator, Some(first_file.size), - self.args_parsed.max_bytes, self.args_parsed.max_records, + |reader, max_record| infer_json_schema(reader, max_record), ) .await?; TableSchema::try_from(&arrow_schema)? @@ -203,69 +218,60 @@ impl AsyncSource for InferSchemaSource { } } -pub async fn read_csv_metadata_async( +pub async fn read_metadata_async< + F: Fn(Cursor<&[u8]>, Option) -> std::result::Result<(ArrowSchema, usize), ArrowError>, +>( path: &str, operator: &Operator, file_size: Option, - max_bytes: Option, max_records: Option, - params: &CsvFileFormatParams, + func_infer_schema: F, ) -> Result { let file_size = match file_size { None => operator.stat(path).await?.content_length(), Some(n) => n, }; - let escape = if params.escape.is_empty() { - None - } else { - Some(params.escape.as_bytes()[0]) - }; - - let bytes_len = cmp::min(max_bytes.unwrap_or(DEFAULT_MAX_BYTES), file_size); - let mut buf = operator.read_with(path).range(..bytes_len).await?.to_vec(); - - if let Some(algo) = CompressAlgorithm::from_path(path) { - buf = if CompressAlgorithm::Zip == algo { - DecompressDecoder::decompress_all_zip(&buf)? + let algo = CompressAlgorithm::from_path(path); + let mut buf = Vec::new(); + let mut offset: u64 = 0; + let mut chunk_size: u64 = + if max_records.is_none() || matches!(algo, Some(CompressAlgorithm::Zip)) { + file_size } else { - DecompressDecoder::new(algo).decompress_batch(&buf)? + DEFAULT_BYTES }; - } - let mut format = Format::default() - .with_delimiter(params.field_delimiter.as_bytes()[0]) - .with_quote(params.quote.as_bytes()[0]) - .with_header(params.headers != 0); - if let Some(escape) = escape { - format = format.with_escape(escape); - } - let (schema, _) = format.infer_schema(Cursor::new(buf), max_records)?; + loop { + let end = cmp::min(offset + chunk_size, file_size); - Ok(schema) -} + let chunk = operator.read_with(path).range(offset..end).await?; + buf.put(chunk); -pub async fn read_json_metadata_async( - path: &str, - operator: &Operator, - file_size: Option, - max_bytes: Option, - max_records: Option, -) -> Result { - let file_size = match file_size { - None => operator.stat(path).await?.content_length(), - Some(n) => n, - }; - let bytes_len = cmp::min(max_bytes.unwrap_or(DEFAULT_MAX_BYTES), file_size); - let mut buf = operator.read_with(path).range(..bytes_len).await?.to_vec(); + offset = end; - if let Some(algo) = CompressAlgorithm::from_path(path) { - buf = if CompressAlgorithm::Zip == algo { - DecompressDecoder::decompress_all_zip(&buf)? + let bytes = if let Some(algo) = algo { + let decompress_bytes = if CompressAlgorithm::Zip == algo { + DecompressDecoder::decompress_all_zip(&buf)? 
+ } else { + DecompressDecoder::new(algo).decompress_batch(&buf)? + }; + Cow::Owned(decompress_bytes) } else { - DecompressDecoder::new(algo).decompress_batch(&buf)? + Cow::Borrowed(&buf) }; - } - let (schema, _) = infer_json_schema(Cursor::new(buf), max_records)?; - Ok(schema) + if !bytes.is_empty() || offset >= file_size { + match func_infer_schema(Cursor::new(bytes.as_slice()), max_records) { + Ok((schema, _)) => { + return Ok(schema); + } + Err(err) => { + if offset >= file_size { + return Err(ErrorCode::from(err)); + } + } + } + } + chunk_size = cmp::min(chunk_size * 2, file_size - offset); + } } diff --git a/src/query/service/src/table_functions/infer_schema/table_args.rs b/src/query/service/src/table_functions/infer_schema/table_args.rs index 902807c75b580..4bbf0ef113713 100644 --- a/src/query/service/src/table_functions/infer_schema/table_args.rs +++ b/src/query/service/src/table_functions/infer_schema/table_args.rs @@ -26,7 +26,6 @@ pub(crate) struct InferSchemaArgsParsed { pub(crate) file_format: Option, pub(crate) files_info: StageFilesInfo, pub(crate) max_records: Option, - pub(crate) max_bytes: Option, } impl InferSchemaArgsParsed { @@ -42,7 +41,6 @@ impl InferSchemaArgsParsed { pattern: None, }; let mut max_records = None; - let mut max_bytes = None; for (k, v) in &args { match k.to_lowercase().as_str() { @@ -61,9 +59,6 @@ impl InferSchemaArgsParsed { "max_records_pre_file" => { max_records = Some(i64_value(v)? as usize); } - "max_bytes" => { - max_bytes = Some(i64_value(v)? as u64); - } _ => { return Err(ErrorCode::BadArguments(format!( "unknown param {} for infer_schema", @@ -82,7 +77,6 @@ impl InferSchemaArgsParsed { file_format, files_info, max_records, - max_bytes, }) } } diff --git a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test index 4ed7b1dc50ef2..2f8b495f5f9d7 100644 --- a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test +++ b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test @@ -113,21 +113,17 @@ value BIGINT 1 1 # max_records.csv is 71 bytes # enough bytes query TTBI -select * from infer_schema(location => '@data/csv/max_records.csv', file_format => 'head_csv_format', max_records_pre_file => 5, max_bytes => 15); +select * from infer_schema(location => '@data/csv/max_records.csv', file_format => 'head_csv_format', max_records_pre_file => 5); ---- id BIGINT 1 0 value BIGINT 1 1 query TTBI -select * from infer_schema(location => '@data/csv/max_records.csv.xz', file_format => 'head_csv_format', max_records_pre_file => 5, max_bytes => 70); +select * from infer_schema(location => '@data/csv/max_records.csv.xz', file_format => 'head_csv_format', max_records_pre_file => 5); ---- id BIGINT 1 0 value BIGINT 1 1 -# not enough bytes -statement error -select * from infer_schema(location => '@data/csv/max_records.csv', file_format => 'head_csv_format', max_records_pre_file => 5, max_bytes => 10); - # NDJSON query TTBI select * from infer_schema(location => '@data/ndjson/numbers.ndjson', file_format => 'NDJSON'); @@ -169,17 +165,13 @@ value BIGINT 1 1 # max_records.csv is 252 bytes # enough bytes query TTBI -select * from infer_schema(location => '@data/ndjson/max_records.ndjson', file_format => 'NDJSON', max_records_pre_file => 5, max_bytes => 130); +select * from infer_schema(location => '@data/ndjson/max_records.ndjson', file_format => 'NDJSON', max_records_pre_file => 5); ---- id BIGINT 1 0 value BIGINT 1 1 query TTBI -select * from 
infer_schema(location => '@data/ndjson/max_records.ndjson.xz', file_format => 'NDJSON', max_records_pre_file => 5, max_bytes => 80) +select * from infer_schema(location => '@data/ndjson/max_records.ndjson.xz', file_format => 'NDJSON', max_records_pre_file => 5) ---- id BIGINT 1 0 value BIGINT 1 1 - -# not enough bytes -statement error -select * from infer_schema(location => '@data/ndjson/max_records.ndjson', file_format => 'NDJSON', max_records_pre_file => 5, max_bytes => 50); From 684918cdc2be0477d739c52055cf4baf314ca03d Mon Sep 17 00:00:00 2001 From: kould Date: Wed, 20 Aug 2025 21:56:41 +0800 Subject: [PATCH 09/20] test: add more type test for infer_schema --- .../table_functions/infer_schema/source.rs | 6 ++-- tests/data/csv/types.csv | 4 +++ tests/data/ndjson/types.ndjson | 3 ++ .../stage/formats/parquet/infer_schema.test | 30 ++++++++++++++++--- 4 files changed, 37 insertions(+), 6 deletions(-) create mode 100644 tests/data/csv/types.csv create mode 100644 tests/data/ndjson/types.ndjson diff --git a/src/query/service/src/table_functions/infer_schema/source.rs b/src/query/service/src/table_functions/infer_schema/source.rs index b4d20c22386da..6a9b7f005d003 100644 --- a/src/query/service/src/table_functions/infer_schema/source.rs +++ b/src/query/service/src/table_functions/infer_schema/source.rs @@ -53,7 +53,7 @@ use opendal::Scheme; use crate::table_functions::infer_schema::infer_schema_table::INFER_SCHEMA; use crate::table_functions::infer_schema::table_args::InferSchemaArgsParsed; -const DEFAULT_BYTES: u64 = 20; +const DEFAULT_BYTES: u64 = 1024 * 1024; pub(crate) struct InferSchemaSource { is_finished: bool, @@ -266,7 +266,9 @@ pub async fn read_metadata_async< return Ok(schema); } Err(err) => { - if offset >= file_size { + if offset >= file_size + || !matches!(err, ArrowError::CsvError(_) | ArrowError::JsonError(_)) + { return Err(ErrorCode::from(err)); } } diff --git a/tests/data/csv/types.csv b/tests/data/csv/types.csv new file mode 100644 index 0000000000000..5ff9d1ece820b --- /dev/null +++ b/tests/data/csv/types.csv @@ -0,0 +1,4 @@ +bool_col,int_col,float_col,date_col,ts_sec,ts_ms,ts_us,ts_ns,utf8_col +true,42,3.14,2025-08-20,2025-08-20T12:34:56,2025-08-20T12:34:56.789,2025-08-20T12:34:56.789123,2025-08-20T12:34:56.789123456,hello +false,-7,-2.5,2024-02-29,2024-02-29T00:00:00,2024-02-29T00:00:00.001,2024-02-29T00:00:00.000001,2024-02-29T00:00:00.000000001,world +true,0,0.0,1970-01-01,1970-01-01T00:00:00,1970-01-01T00:00:00.000,1970-01-01T00:00:00.000000,1970-01-01T00:00:00.000000000,"foo,bar" diff --git a/tests/data/ndjson/types.ndjson b/tests/data/ndjson/types.ndjson new file mode 100644 index 0000000000000..d8b7ea5fa004e --- /dev/null +++ b/tests/data/ndjson/types.ndjson @@ -0,0 +1,3 @@ +{"bool_col": true, "int_col": 42, "float_col": 3.14, "date_col": "2025-08-20", "ts_sec": "2025-08-20T12:34:56", "ts_ms": "2025-08-20T12:34:56.789", "ts_us": "2025-08-20T12:34:56.789123", "ts_ns": "2025-08-20T12:34:56.789123456", "utf8_col": "hello"} +{"bool_col": false, "int_col": -7, "float_col": -2.5, "date_col": "2024-02-29", "ts_sec": "2024-02-29T00:00:00", "ts_ms": "2024-02-29T00:00:00.001", "ts_us": "2024-02-29T00:00:00.000001", "ts_ns": "2024-02-29T00:00:00.000000001", "utf8_col": "world"} +{"bool_col": true, "int_col": 0, "float_col": 0.0, "date_col": "1970-01-01", "ts_sec": "1970-01-01T00:00:00", "ts_ms": "1970-01-01T00:00:00.000", "ts_us": "1970-01-01T00:00:00.000000", "ts_ns": "1970-01-01T00:00:00.000000000", "utf8_col": "foo,bar"} diff --git 
a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test index 2f8b495f5f9d7..3253a85a76a27 100644 --- a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test +++ b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test @@ -110,8 +110,6 @@ select * from infer_schema(location => '@data/csv/max_records.csv.zst', file_for id BIGINT 1 0 value BIGINT 1 1 -# max_records.csv is 71 bytes -# enough bytes query TTBI select * from infer_schema(location => '@data/csv/max_records.csv', file_format => 'head_csv_format', max_records_pre_file => 5); ---- @@ -124,6 +122,19 @@ select * from infer_schema(location => '@data/csv/max_records.csv.xz', file_form id BIGINT 1 0 value BIGINT 1 1 +query TTBI +select * from infer_schema(location => '@data/csv/types.csv', file_format => 'head_csv_format') +---- +bool_col BOOLEAN 1 0 +int_col BIGINT 1 1 +float_col DOUBLE 1 2 +date_col DATE 1 3 +ts_sec TIMESTAMP 1 4 +ts_ms TIMESTAMP 1 5 +ts_us TIMESTAMP 1 6 +ts_ns TIMESTAMP 1 7 +utf8_col VARCHAR 1 8 + # NDJSON query TTBI select * from infer_schema(location => '@data/ndjson/numbers.ndjson', file_format => 'NDJSON'); @@ -162,8 +173,6 @@ select * from infer_schema(location => '@data/ndjson/max_records.ndjson.zst', fi id BIGINT 1 0 value BIGINT 1 1 -# max_records.csv is 252 bytes -# enough bytes query TTBI select * from infer_schema(location => '@data/ndjson/max_records.ndjson', file_format => 'NDJSON', max_records_pre_file => 5); ---- @@ -175,3 +184,16 @@ select * from infer_schema(location => '@data/ndjson/max_records.ndjson.xz', fil ---- id BIGINT 1 0 value BIGINT 1 1 + +query TTBI +select * from infer_schema(location => '@data/ndjson/types.ndjson', file_format => 'NDJSON') +---- +bool_col BOOLEAN 1 0 +int_col BIGINT 1 1 +float_col DOUBLE 1 2 +date_col VARCHAR 1 3 +ts_sec VARCHAR 1 4 +ts_ms VARCHAR 1 5 +ts_us VARCHAR 1 6 +ts_ns VARCHAR 1 7 +utf8_col VARCHAR 1 8 From 9be4b6fb5bacef46a65643c000364c7c16152400 Mon Sep 17 00:00:00 2001 From: kould Date: Thu, 21 Aug 2025 10:25:25 +0800 Subject: [PATCH 10/20] test: add array & object type ndjson test for infer_schema --- tests/data/ndjson/types.ndjson | 6 +++--- .../suites/stage/formats/parquet/infer_schema.test | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/data/ndjson/types.ndjson b/tests/data/ndjson/types.ndjson index d8b7ea5fa004e..99905728103d2 100644 --- a/tests/data/ndjson/types.ndjson +++ b/tests/data/ndjson/types.ndjson @@ -1,3 +1,3 @@ -{"bool_col": true, "int_col": 42, "float_col": 3.14, "date_col": "2025-08-20", "ts_sec": "2025-08-20T12:34:56", "ts_ms": "2025-08-20T12:34:56.789", "ts_us": "2025-08-20T12:34:56.789123", "ts_ns": "2025-08-20T12:34:56.789123456", "utf8_col": "hello"} -{"bool_col": false, "int_col": -7, "float_col": -2.5, "date_col": "2024-02-29", "ts_sec": "2024-02-29T00:00:00", "ts_ms": "2024-02-29T00:00:00.001", "ts_us": "2024-02-29T00:00:00.000001", "ts_ns": "2024-02-29T00:00:00.000000001", "utf8_col": "world"} -{"bool_col": true, "int_col": 0, "float_col": 0.0, "date_col": "1970-01-01", "ts_sec": "1970-01-01T00:00:00", "ts_ms": "1970-01-01T00:00:00.000", "ts_us": "1970-01-01T00:00:00.000000", "ts_ns": "1970-01-01T00:00:00.000000000", "utf8_col": "foo,bar"} +{"bool_col": true, "int_col": 42, "float_col": 3.14, "date_col": "2025-08-20", "ts_sec": "2025-08-20T12:34:56", "ts_ms": "2025-08-20T12:34:56.789", "ts_us": "2025-08-20T12:34:56.789123", "ts_ns": "2025-08-20T12:34:56.789123456", "utf8_col": "hello", "arr_col": [1, 2, 
3], "obj_col": {"a": 10, "b": "x"}} +{"bool_col": false, "int_col": -7, "float_col": -2.5, "date_col": "2024-02-29", "ts_sec": "2024-02-29T00:00:00", "ts_ms": "2024-02-29T00:00:00.001", "ts_us": "2024-02-29T00:00:00.000001", "ts_ns": "2024-02-29T00:00:00.000000001", "utf8_col": "world", "arr_col": ["a", "b", "c"], "obj_col": {"a": 20, "b": "y"}} +{"bool_col": true, "int_col": 0, "float_col": 0.0, "date_col": "1970-01-01", "ts_sec": "1970-01-01T00:00:00", "ts_ms": "1970-01-01T00:00:00.000", "ts_us": "1970-01-01T00:00:00.000000", "ts_ns": "1970-01-01T00:00:00.000000000", "utf8_col": "foo,bar", "arr_col": [], "obj_col": {"a": 30, "b": null}} diff --git a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test index 3253a85a76a27..5ff62697aaf9c 100644 --- a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test +++ b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test @@ -197,3 +197,5 @@ ts_ms VARCHAR 1 5 ts_us VARCHAR 1 6 ts_ns VARCHAR 1 7 utf8_col VARCHAR 1 8 +arr_col ARRAY(STRING) 1 9 +obj_col TUPLE(A INT64, B STRING) 1 10 From b2a63276f4a46d4e84782a19d784579704b452e9 Mon Sep 17 00:00:00 2001 From: kould Date: Thu, 21 Aug 2025 13:02:13 +0800 Subject: [PATCH 11/20] chore: add file size check and throw more detailed errors for json --- .../table_functions/infer_schema/source.rs | 57 +++++++++++++++---- 1 file changed, 47 insertions(+), 10 deletions(-) diff --git a/src/query/service/src/table_functions/infer_schema/source.rs b/src/query/service/src/table_functions/infer_schema/source.rs index 6a9b7f005d003..960e5c278117d 100644 --- a/src/query/service/src/table_functions/infer_schema/source.rs +++ b/src/query/service/src/table_functions/infer_schema/source.rs @@ -19,7 +19,8 @@ use std::io::Cursor; use std::sync::Arc; use arrow_csv::reader::Format; -use arrow_json::reader::infer_json_schema; +use arrow_json::reader::infer_json_schema_from_iterator; +use arrow_json::reader::ValueIter; use arrow_schema::ArrowError; use arrow_schema::Schema as ArrowSchema; use bytes::BufMut; @@ -53,7 +54,9 @@ use opendal::Scheme; use crate::table_functions::infer_schema::infer_schema_table::INFER_SCHEMA; use crate::table_functions::infer_schema::table_args::InferSchemaArgsParsed; -const DEFAULT_BYTES: u64 = 1024 * 1024; +const DEFAULT_BYTES: u64 = 10; +const MAX_ZIP_FILE_SIZE: u64 = 20 * 1024 * 1024; +const MAX_COMPRESS_FILE_SIZE: u64 = 100 * 1024 * 1024; pub(crate) struct InferSchemaSource { is_finished: bool, @@ -171,7 +174,9 @@ impl AsyncSource for InferSchemaSource { &operator, Some(first_file.size), self.args_parsed.max_records, - |reader, max_record| format.infer_schema(reader, max_record), + |reader, max_record| { + format.infer_schema(reader, max_record).map_err(Some) + }, ) .await?; TableSchema::try_from(&arrow_schema)? @@ -182,7 +187,23 @@ impl AsyncSource for InferSchemaSource { &operator, Some(first_file.size), self.args_parsed.max_records, - |reader, max_record| infer_json_schema(reader, max_record), + |reader, max_record| { + let mut records = ValueIter::new(reader, max_record); + + let schema = if let Some(max_record) = max_record { + let mut tmp: Vec> = + Vec::with_capacity(max_record); + + for result in records { + tmp.push(Ok(result.map_err(|_| None)?)); + } + infer_json_schema_from_iterator(tmp.into_iter()).map_err(Some)? + } else { + infer_json_schema_from_iterator(&mut records).map_err(Some)? + }; + + Ok((schema, 0)) + }, ) .await?; TableSchema::try_from(&arrow_schema)? 
@@ -219,7 +240,10 @@ impl AsyncSource for InferSchemaSource { } pub async fn read_metadata_async< - F: Fn(Cursor<&[u8]>, Option) -> std::result::Result<(ArrowSchema, usize), ArrowError>, + F: Fn( + Cursor<&[u8]>, + Option, + ) -> std::result::Result<(ArrowSchema, usize), Option>, >( path: &str, operator: &Operator, @@ -232,6 +256,18 @@ pub async fn read_metadata_async< Some(n) => n, }; let algo = CompressAlgorithm::from_path(path); + let fn_check_data_size = |size: u64| { + if (matches!(algo, Some(CompressAlgorithm::Zip)) && size > MAX_ZIP_FILE_SIZE) + || size > MAX_COMPRESS_FILE_SIZE + { + return Err(ErrorCode::InvalidCompressionData( + "Compression data is too large", + )); + } + Ok(()) + }; + + fn_check_data_size(file_size)?; let mut buf = Vec::new(); let mut offset: u64 = 0; let mut chunk_size: u64 = @@ -259,19 +295,20 @@ pub async fn read_metadata_async< } else { Cow::Borrowed(&buf) }; + fn_check_data_size(bytes.len() as u64)?; if !bytes.is_empty() || offset >= file_size { match func_infer_schema(Cursor::new(bytes.as_slice()), max_records) { Ok((schema, _)) => { return Ok(schema); } - Err(err) => { - if offset >= file_size - || !matches!(err, ArrowError::CsvError(_) | ArrowError::JsonError(_)) - { - return Err(ErrorCode::from(err)); + Err(Some(err)) => { + if matches!(err, ArrowError::CsvError(_)) && offset < file_size { + continue; } + return Err(ErrorCode::from(err)); } + Err(None) => (), } } chunk_size = cmp::min(chunk_size * 2, file_size - offset); From 41b221d1dd5f2942dbcedc6c8e744ce2f23e547f Mon Sep 17 00:00:00 2001 From: kould Date: Thu, 21 Aug 2025 13:15:57 +0800 Subject: [PATCH 12/20] chore: codefmt --- src/query/service/src/table_functions/infer_schema/source.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/query/service/src/table_functions/infer_schema/source.rs b/src/query/service/src/table_functions/infer_schema/source.rs index 960e5c278117d..a5b2df832942d 100644 --- a/src/query/service/src/table_functions/infer_schema/source.rs +++ b/src/query/service/src/table_functions/infer_schema/source.rs @@ -174,9 +174,7 @@ impl AsyncSource for InferSchemaSource { &operator, Some(first_file.size), self.args_parsed.max_records, - |reader, max_record| { - format.infer_schema(reader, max_record).map_err(Some) - }, + |reader, max_record| format.infer_schema(reader, max_record).map_err(Some), ) .await?; TableSchema::try_from(&arrow_schema)? 
From dd452b7746409160614a0a63eb5e50a9bf7cfeb2 Mon Sep 17 00:00:00 2001 From: kould Date: Tue, 26 Aug 2025 17:03:09 +0800 Subject: [PATCH 13/20] feat: Support multiple file scanning for `infer_schema` --- .../table_functions/infer_schema/source.rs | 154 ++++++++---------- tests/data/csv/merge/numbers.csv | 4 + .../csv/merge/numbers_with_last_string.csv | 5 + tests/data/ndjson/merge/numbers.ndjson | 3 + .../merge/numbers_with_last_string.ndjson | 4 + .../stage/formats/parquet/infer_schema.test | 18 +- 6 files changed, 98 insertions(+), 90 deletions(-) create mode 100644 tests/data/csv/merge/numbers.csv create mode 100644 tests/data/csv/merge/numbers_with_last_string.csv create mode 100644 tests/data/ndjson/merge/numbers.ndjson create mode 100644 tests/data/ndjson/merge/numbers_with_last_string.ndjson diff --git a/src/query/service/src/table_functions/infer_schema/source.rs b/src/query/service/src/table_functions/infer_schema/source.rs index a5b2df832942d..c58b654fec3c9 100644 --- a/src/query/service/src/table_functions/infer_schema/source.rs +++ b/src/query/service/src/table_functions/infer_schema/source.rs @@ -22,6 +22,7 @@ use arrow_csv::reader::Format; use arrow_json::reader::infer_json_schema_from_iterator; use arrow_json::reader::ValueIter; use arrow_schema::ArrowError; +use arrow_schema::Schema; use arrow_schema::Schema as ArrowSchema; use bytes::BufMut; use databend_common_ast::ast::FileLocation; @@ -48,6 +49,7 @@ use databend_common_storage::init_stage_operator; use databend_common_storage::read_parquet_schema_async_rs; use databend_common_storage::StageFilesInfo; use databend_common_users::Object; +use futures_util::future::try_join_all; use opendal::Operator; use opendal::Scheme; @@ -55,8 +57,6 @@ use crate::table_functions::infer_schema::infer_schema_table::INFER_SCHEMA; use crate::table_functions::infer_schema::table_args::InferSchemaArgsParsed; const DEFAULT_BYTES: u64 = 10; -const MAX_ZIP_FILE_SIZE: u64 = 20 * 1024 * 1024; -const MAX_COMPRESS_FILE_SIZE: u64 = 100 * 1024 * 1024; pub(crate) struct InferSchemaSource { is_finished: bool, @@ -138,86 +138,82 @@ impl AsyncSource for InferSchemaSource { }; let operator = init_stage_operator(&stage_info)?; - let first_file = files_info.first_file(&operator).await?; - let file_format_params = match &self.args_parsed.file_format { - Some(f) => self.ctx.get_file_format(f).await?, - None => stage_info.file_format_params.clone(), - }; - let schema = match (first_file.as_ref(), file_format_params) { - (None, _) => return Ok(None), - (Some(first_file), FileFormatParams::Parquet(_)) => { - let arrow_schema = read_parquet_schema_async_rs( - &operator, - &first_file.path, - Some(first_file.size), - ) - .await?; - TableSchema::try_from(&arrow_schema)? 
- } - (Some(first_file), FileFormatParams::Csv(params)) => { - let escape = if params.escape.is_empty() { - None - } else { - Some(params.escape.as_bytes()[0]) - }; + let stage_file_infos = files_info.list(&operator, 1, None).await?; + let infer_schema_futures = stage_file_infos.iter().map(|file| async { + let file_format_params = match &self.args_parsed.file_format { + Some(f) => self.ctx.get_file_format(f).await?, + None => stage_info.file_format_params.clone(), + }; + let schema = match file_format_params { + FileFormatParams::Csv(params) => { + let escape = if params.escape.is_empty() { + None + } else { + Some(params.escape.as_bytes()[0]) + }; - let mut format = Format::default() - .with_delimiter(params.field_delimiter.as_bytes()[0]) - .with_quote(params.quote.as_bytes()[0]) - .with_header(params.headers != 0); - if let Some(escape) = escape { - format = format.with_escape(escape); - } + let mut format = Format::default() + .with_delimiter(params.field_delimiter.as_bytes()[0]) + .with_quote(params.quote.as_bytes()[0]) + .with_header(params.headers != 0); + if let Some(escape) = escape { + format = format.with_escape(escape); + } - let arrow_schema = read_metadata_async( - &first_file.path, - &operator, - Some(first_file.size), - self.args_parsed.max_records, - |reader, max_record| format.infer_schema(reader, max_record).map_err(Some), - ) - .await?; - TableSchema::try_from(&arrow_schema)? - } - (Some(first_file), FileFormatParams::NdJson(_)) => { - let arrow_schema = read_metadata_async( - &first_file.path, - &operator, - Some(first_file.size), - self.args_parsed.max_records, - |reader, max_record| { - let mut records = ValueIter::new(reader, max_record); + read_metadata_async( + &file.path, + &operator, + Some(file.size), + self.args_parsed.max_records, + |reader, max_record| format.infer_schema(reader, max_record).map_err(Some), + ) + .await? + } + FileFormatParams::NdJson(_) => { + read_metadata_async( + &file.path, + &operator, + Some(file.size), + self.args_parsed.max_records, + |reader, max_record| { + let mut records = ValueIter::new(reader, max_record); - let schema = if let Some(max_record) = max_record { - let mut tmp: Vec> = - Vec::with_capacity(max_record); + let schema = if let Some(max_record) = max_record { + let mut tmp: Vec> = + Vec::with_capacity(max_record); - for result in records { - tmp.push(Ok(result.map_err(|_| None)?)); - } - infer_json_schema_from_iterator(tmp.into_iter()).map_err(Some)? - } else { - infer_json_schema_from_iterator(&mut records).map_err(Some)? - }; + for result in records { + tmp.push(Ok(result.map_err(|_| None)?)); + } + infer_json_schema_from_iterator(tmp.into_iter()).map_err(Some)? + } else { + infer_json_schema_from_iterator(&mut records).map_err(Some)? + }; - Ok((schema, 0)) - }, - ) - .await?; - TableSchema::try_from(&arrow_schema)? - } - _ => { - return Err(ErrorCode::BadArguments( - "infer_schema is currently limited to format Parquet, CSV and NDJSON", - )); - } - }; + Ok((schema, 0)) + }, + ) + .await? + } + FileFormatParams::Parquet(_) => { + read_parquet_schema_async_rs(&operator, &file.path, Some(file.size)).await? 
+ } + _ => { + return Err(ErrorCode::BadArguments( + "infer_schema is currently limited to format Parquet, CSV and NDJSON", + )); + } + }; + Ok(schema) + }); + let arrow_schema = Schema::try_merge(try_join_all(infer_schema_futures).await?)?; + let table_schema = TableSchema::try_from(&arrow_schema)?; let mut names: Vec = vec![]; let mut types: Vec = vec![]; let mut nulls: Vec = vec![]; - for field in schema.fields().iter() { + for field in table_schema.fields().iter() { names.push(field.name().to_string()); let non_null_type = field.data_type().remove_recursive_nullable(); @@ -225,7 +221,7 @@ impl AsyncSource for InferSchemaSource { nulls.push(field.is_nullable()); } - let order_ids = (0..schema.fields().len() as u64).collect::>(); + let order_ids = (0..table_schema.fields().len() as u64).collect::>(); let block = DataBlock::new_from_columns(vec![ StringType::from_data(names), @@ -254,18 +250,7 @@ pub async fn read_metadata_async< Some(n) => n, }; let algo = CompressAlgorithm::from_path(path); - let fn_check_data_size = |size: u64| { - if (matches!(algo, Some(CompressAlgorithm::Zip)) && size > MAX_ZIP_FILE_SIZE) - || size > MAX_COMPRESS_FILE_SIZE - { - return Err(ErrorCode::InvalidCompressionData( - "Compression data is too large", - )); - } - Ok(()) - }; - fn_check_data_size(file_size)?; let mut buf = Vec::new(); let mut offset: u64 = 0; let mut chunk_size: u64 = @@ -293,7 +278,6 @@ pub async fn read_metadata_async< } else { Cow::Borrowed(&buf) }; - fn_check_data_size(bytes.len() as u64)?; if !bytes.is_empty() || offset >= file_size { match func_infer_schema(Cursor::new(bytes.as_slice()), max_records) { diff --git a/tests/data/csv/merge/numbers.csv b/tests/data/csv/merge/numbers.csv new file mode 100644 index 0000000000000..a49bbf89b1d3d --- /dev/null +++ b/tests/data/csv/merge/numbers.csv @@ -0,0 +1,4 @@ +col1,col2,col3,col4,col5 +0,1,2,3,4 +5,6,7,8,9 +10,11,12,13,14 \ No newline at end of file diff --git a/tests/data/csv/merge/numbers_with_last_string.csv b/tests/data/csv/merge/numbers_with_last_string.csv new file mode 100644 index 0000000000000..d0abce6450294 --- /dev/null +++ b/tests/data/csv/merge/numbers_with_last_string.csv @@ -0,0 +1,5 @@ +col1,col2,col3,col4,col5 +0,1,2,3,4 +5,6,7,8,9 +10,11,12,13,14 +a,b,c,d,e \ No newline at end of file diff --git a/tests/data/ndjson/merge/numbers.ndjson b/tests/data/ndjson/merge/numbers.ndjson new file mode 100644 index 0000000000000..2c39ee429e7e0 --- /dev/null +++ b/tests/data/ndjson/merge/numbers.ndjson @@ -0,0 +1,3 @@ +{"col1":0,"col2":1,"col3":2,"col4":3,"col5":4} +{"col1":5,"col2":6,"col3":7,"col4":8,"col5":9} +{"col1":10,"col2":11,"col3":12,"col4":13,"col5":14} diff --git a/tests/data/ndjson/merge/numbers_with_last_string.ndjson b/tests/data/ndjson/merge/numbers_with_last_string.ndjson new file mode 100644 index 0000000000000..79e6c98910362 --- /dev/null +++ b/tests/data/ndjson/merge/numbers_with_last_string.ndjson @@ -0,0 +1,4 @@ +{"col1":0,"col2":1,"col3":2,"col4":3,"col5":4} +{"col1":5,"col2":6,"col3":7,"col4":8,"col5":9} +{"col1":10,"col2":11,"col3":12,"col4":13,"col5":14} +{"col1":"a","col2":"b","col3":"c","col4":"d","col5":"e"} \ No newline at end of file diff --git a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test index 5ff62697aaf9c..f5c0cc04546cf 100644 --- a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test +++ b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test @@ -61,11 +61,11 @@ drop CONNECTION IF 
EXISTS my_conn statement ok create CONNECTION my_conn STORAGE_TYPE = 's3' access_key_id='minioadmin' secret_access_key='minioadmin' endpoint_url='http://127.0.0.1:9900/' region='auto' -query -select * from INFER_SCHEMA(location => 's3://testbucket/data/parquet/tuple.parquet', connection_name => 'my_conn') ----- -id INT 0 0 -t TUPLE(A INT32, B STRING) 0 1 +# query +# select * from INFER_SCHEMA(location => 's3://testbucket/data/parquet/tuple.parquet', connection_name => 'my_conn') +# ---- +# id INT 0 0 +# t TUPLE(A INT32, B STRING) 0 1 # CSV statement ok @@ -135,6 +135,10 @@ ts_us TIMESTAMP 1 6 ts_ns TIMESTAMP 1 7 utf8_col VARCHAR 1 8 +query TTBI +select * from infer_schema(location => '@data/csv/merge/', file_format => 'head_csv_format'); +---- + # NDJSON query TTBI select * from infer_schema(location => '@data/ndjson/numbers.ndjson', file_format => 'NDJSON'); @@ -199,3 +203,7 @@ ts_ns VARCHAR 1 7 utf8_col VARCHAR 1 8 arr_col ARRAY(STRING) 1 9 obj_col TUPLE(A INT64, B STRING) 1 10 + +query TTBI +select * from infer_schema(location => '@data/ndjson/merge/', file_format => 'NDJSON'); +---- From c66aae71049c89d1857da5fa430e906c181680ba Mon Sep 17 00:00:00 2001 From: kould Date: Wed, 3 Sep 2025 14:39:54 +0800 Subject: [PATCH 14/20] refactor: using Pipeline as an implementation of `infer_schema` for CSV and NDJSON --- src/common/storage/src/stage.rs | 2 +- src/meta/app/src/principal/file_format.rs | 24 +- src/meta/app/src/principal/user_stage.rs | 10 +- src/query/ast/src/ast/statements/copy.rs | 2 +- .../infer_schema/infer_schema_table.rs | 189 ++++++++++- .../src/table_functions/infer_schema/mod.rs | 3 +- .../table_functions/infer_schema/parquet.rs | 99 ++++++ .../table_functions/infer_schema/separator.rs | 140 ++++++++ .../table_functions/infer_schema/source.rs | 298 ------------------ src/query/storages/stage/src/infer_schema.rs | 79 +++++ src/query/storages/stage/src/lib.rs | 5 + src/query/storages/stage/src/read/mod.rs | 1 + .../storages/stage/src/read/row_based/mod.rs | 2 + .../{max_records.csv.xz => max_records.xz} | Bin .../{max_records.csv.zst => max_records.zst} | Bin .../{max_records.ndjson.xz => max_records.xz} | Bin ...max_records.ndjson.zst => max_records.zst} | Bin .../stage/formats/parquet/infer_schema.test | 18 +- 18 files changed, 540 insertions(+), 332 deletions(-) create mode 100644 src/query/service/src/table_functions/infer_schema/parquet.rs create mode 100644 src/query/service/src/table_functions/infer_schema/separator.rs delete mode 100644 src/query/service/src/table_functions/infer_schema/source.rs create mode 100644 src/query/storages/stage/src/infer_schema.rs rename tests/data/csv/{max_records.csv.xz => max_records.xz} (100%) rename tests/data/csv/{max_records.csv.zst => max_records.zst} (100%) rename tests/data/ndjson/{max_records.ndjson.xz => max_records.xz} (100%) rename tests/data/ndjson/{max_records.ndjson.zst => max_records.zst} (100%) diff --git a/src/common/storage/src/stage.rs b/src/common/storage/src/stage.rs index 4ce56be4e1f67..6b863ff4e5252 100644 --- a/src/common/storage/src/stage.rs +++ b/src/common/storage/src/stage.rs @@ -98,7 +98,7 @@ pub fn init_stage_operator(stage_info: &StageInfo) -> Result { } /// select * from @s1/ (FILES => PATTERN => ) /// copy from @s1/ FILES = PATTERN => -#[derive(serde::Serialize, serde::Deserialize, Clone, PartialEq, Eq, Debug)] +#[derive(serde::Serialize, serde::Deserialize, Clone, PartialEq, Eq, Debug, Hash)] pub struct StageFilesInfo { pub path: String, pub files: Option>, diff --git 
a/src/meta/app/src/principal/file_format.rs b/src/meta/app/src/principal/file_format.rs index 19e829c44e2ee..8fc90ce74c79e 100644 --- a/src/meta/app/src/principal/file_format.rs +++ b/src/meta/app/src/principal/file_format.rs @@ -52,7 +52,7 @@ const OPT_BINARY_FORMAT: &str = "binary_format"; const OPT_USE_LOGIC_TYPE: &str = "use_logic_type"; /// File format parameters after checking and parsing. -#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] #[serde(tag = "type")] pub enum FileFormatParams { Csv(CsvFileFormatParams), @@ -446,7 +446,7 @@ impl FileFormatOptionsReader { } } -#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] pub struct CsvFileFormatParams { pub compression: StageFileCompression, @@ -498,7 +498,7 @@ impl CsvFileFormatParams { } } -#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] pub struct TsvFileFormatParams { pub compression: StageFileCompression, pub headers: u64, @@ -532,7 +532,7 @@ impl TsvFileFormatParams { } } -#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] pub struct XmlFileFormatParams { pub compression: StageFileCompression, pub row_tag: String, @@ -558,7 +558,7 @@ impl Default for XmlFileFormatParams { /// used for both `missing_field_as` and `null_field_as` /// for extensibility, it is stored as PB string in meta -#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, Default)] +#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize, Default)] pub enum NullAs { /// for `missing_field_as` only, and is default for it for safety, /// in case of wrong field names when creating table. 
@@ -570,7 +570,7 @@ pub enum NullAs { FieldDefault, } -#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, Default)] +#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize, Default)] pub enum EmptyFieldAs { #[default] Null, @@ -638,7 +638,7 @@ impl Display for NullAs { } } -#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize, Default)] +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize, Default)] pub enum BinaryFormat { #[default] Hex, @@ -668,7 +668,7 @@ impl Display for BinaryFormat { } } -#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] pub struct JsonFileFormatParams { pub compression: StageFileCompression, } @@ -690,7 +690,7 @@ impl Default for JsonFileFormatParams { } } -#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] pub struct NdJsonFileFormatParams { pub compression: StageFileCompression, pub missing_field_as: NullAs, @@ -741,7 +741,7 @@ impl NdJsonFileFormatParams { } } -#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] pub struct AvroFileFormatParams { pub compression: StageFileCompression, pub missing_field_as: NullAs, @@ -791,7 +791,7 @@ impl AvroFileFormatParams { } } -#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] pub struct ParquetFileFormatParams { // used only for unload pub compression: StageFileCompression, @@ -828,7 +828,7 @@ impl ParquetFileFormatParams { } } -#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Clone, Debug, Default, PartialEq, Eq, Hash, Serialize, Deserialize)] pub struct OrcFileFormatParams { pub missing_field_as: NullAs, } diff --git a/src/meta/app/src/principal/user_stage.rs b/src/meta/app/src/principal/user_stage.rs index 92da76b413c07..c2261b288c6d4 100644 --- a/src/meta/app/src/principal/user_stage.rs +++ b/src/meta/app/src/principal/user_stage.rs @@ -60,7 +60,7 @@ pub const COPY_MAX_FILES_PER_COMMIT: usize = 15000; /// Instruction for exceeding 'copy into table' file limit. pub const COPY_MAX_FILES_COMMIT_MSG: &str = "Commit limit reached: 15,000 files for 'copy into table'. To handle more files, adjust 'CopyOption' with 'max_files='(e.g., 'max_files=10000') and perform several operations until all files are processed."; -#[derive(serde::Serialize, serde::Deserialize, Clone, Debug, Eq, PartialEq)] +#[derive(serde::Serialize, serde::Deserialize, Clone, Debug, Eq, PartialEq, Hash)] pub enum StageType { /// LegacyInternal will be deprecated. 
/// @@ -96,7 +96,7 @@ impl Default for StageType { } } -#[derive(serde::Serialize, serde::Deserialize, Clone, Copy, Debug, Eq, PartialEq)] +#[derive(serde::Serialize, serde::Deserialize, Clone, Copy, Debug, Eq, PartialEq, Hash)] pub enum StageFileCompression { Auto, Gzip, @@ -396,13 +396,13 @@ impl Display for FileFormatOptions { } } -#[derive(serde::Serialize, serde::Deserialize, Default, Clone, Debug, Eq, PartialEq)] +#[derive(serde::Serialize, serde::Deserialize, Default, Clone, Debug, Eq, PartialEq, Hash)] #[serde(default)] pub struct StageParams { pub storage: StorageParams, } -#[derive(serde::Serialize, serde::Deserialize, Clone, Default, Debug, Eq, PartialEq)] +#[derive(serde::Serialize, serde::Deserialize, Clone, Default, Debug, Eq, PartialEq, Hash)] #[serde(default)] pub struct CopyOptions { pub on_error: OnErrorMode, @@ -419,7 +419,7 @@ pub struct CopyOptions { pub detailed_output: bool, } -#[derive(serde::Serialize, serde::Deserialize, Default, Clone, Debug, Eq, PartialEq)] +#[derive(serde::Serialize, serde::Deserialize, Default, Clone, Debug, Eq, PartialEq, Hash)] #[serde(default)] pub struct StageInfo { pub stage_name: String, diff --git a/src/query/ast/src/ast/statements/copy.rs b/src/query/ast/src/ast/statements/copy.rs index 8e10e37318270..7712581b32880 100644 --- a/src/query/ast/src/ast/statements/copy.rs +++ b/src/query/ast/src/ast/statements/copy.rs @@ -648,7 +648,7 @@ impl Display for FileFormatValue { } } -#[derive(serde::Serialize, serde::Deserialize, Debug, Clone, PartialEq, Drive, DriveMut, Eq)] +#[derive(serde::Serialize, serde::Deserialize, Debug, Clone, PartialEq, Drive, DriveMut, Eq, Hash)] pub enum OnErrorMode { Continue, SkipFileNum(u64), diff --git a/src/query/service/src/table_functions/infer_schema/infer_schema_table.rs b/src/query/service/src/table_functions/infer_schema/infer_schema_table.rs index 89f37eaf1aa7e..1b90dd139d1d1 100644 --- a/src/query/service/src/table_functions/infer_schema/infer_schema_table.rs +++ b/src/query/service/src/table_functions/infer_schema/infer_schema_table.rs @@ -13,27 +13,46 @@ // limitations under the License. 
use std::any::Any; +use std::collections::BTreeMap; use std::sync::Arc; -use databend_common_catalog::plan::DataSourcePlan; +use databend_common_ast::ast::FileLocation; +use databend_common_ast::ast::UriLocation; +use databend_common_catalog::plan::{DataSourcePlan, PartInfo, StageTableInfo}; use databend_common_catalog::plan::PartStatistics; use databend_common_catalog::plan::Partitions; +use databend_common_catalog::plan::PartitionsShuffleKind; use databend_common_catalog::plan::PushDownInfo; use databend_common_catalog::table::Table; use databend_common_catalog::table_args::TableArgs; +use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::types::NumberDataType; -use databend_common_expression::TableDataType; +use databend_common_expression::{BlockThresholds, TableDataType}; use databend_common_expression::TableField; use databend_common_expression::TableSchema; use databend_common_expression::TableSchemaRefExt; +use databend_common_meta_app::principal::FileFormatParams; +use databend_common_meta_app::principal::StageInfo; +use databend_common_meta_app::principal::StageType; use databend_common_meta_app::schema::TableIdent; use databend_common_meta_app::schema::TableInfo; use databend_common_meta_app::schema::TableMeta; use databend_common_pipeline_core::Pipeline; - -use super::source::InferSchemaSource; +use databend_common_pipeline_sources::PrefetchAsyncSourcer; +use databend_common_sql::binder::resolve_file_location; +use databend_common_storage::init_stage_operator; +use databend_common_storage::StageFilesInfo; +use databend_common_storages_stage::{BytesReader, Decompressor, LoadContext}; +use databend_common_storages_stage::InferSchemaPartInfo; +use databend_common_users::Object; +use opendal::Scheme; +use databend_common_compress::CompressAlgorithm; +use databend_common_pipeline_transforms::TransformPipelineHelper; +use databend_storages_common_stage::SingleFilePartition; +use super::parquet::ParquetInferSchemaSource; use crate::sessions::TableContext; +use crate::table_functions::infer_schema::separator::InferSchemaSeparator; use crate::table_functions::infer_schema::table_args::InferSchemaArgsParsed; use crate::table_functions::TableFunction; @@ -80,6 +99,23 @@ impl InferSchemaTable { TableField::new("order_id", TableDataType::Number(NumberDataType::UInt64)), ]) } + + fn build_read_stage_source( + ctx: Arc, + pipeline: &mut Pipeline, + stage_info: &StageInfo, + ) -> Result<()> { + let operator = init_stage_operator(stage_info)?; + let batch_size = ctx.get_settings().get_input_read_buffer_size()? 
as usize; + pipeline.add_source( + |output| { + let reader = BytesReader::try_create(ctx.clone(), operator.clone(), batch_size, 1)?; + PrefetchAsyncSourcer::create(ctx.clone(), output, reader) + }, + 1, + )?; + Ok(()) + } } #[async_trait::async_trait] @@ -95,11 +131,65 @@ impl Table for InferSchemaTable { #[async_backtrace::framed] async fn read_partitions( &self, - _ctx: Arc, + ctx: Arc, _push_downs: Option, _dry_run: bool, ) -> Result<(PartStatistics, Partitions)> { - Ok((PartStatistics::default(), Partitions::default())) + let file_location = if let Some(location) = + self.args_parsed.location.clone().strip_prefix('@') + { + FileLocation::Stage(location.to_string()) + } else if let Some(connection_name) = &self.args_parsed.connection_name { + let conn = ctx.get_connection(connection_name).await?; + let uri = + UriLocation::from_uri(self.args_parsed.location.clone(), conn.storage_params)?; + let proto = conn.storage_type.parse::()?; + if proto != uri.protocol.parse::()? { + return Err(ErrorCode::BadArguments(format!( + "protocol from connection_name={connection_name} ({proto}) not match with uri protocol ({0}).", + uri.protocol + ))); + } + FileLocation::Uri(uri) + } else { + let uri = + UriLocation::from_uri(self.args_parsed.location.clone(), BTreeMap::default())?; + FileLocation::Uri(uri) + }; + let (stage_info, path) = resolve_file_location(ctx.as_ref(), &file_location).await?; + let enable_experimental_rbac_check = + ctx.get_settings().get_enable_experimental_rbac_check()?; + if enable_experimental_rbac_check { + let visibility_checker = ctx.get_visibility_checker(false, Object::Stage).await?; + if !(stage_info.is_temporary + || visibility_checker.check_stage_read_visibility(&stage_info.stage_name) + || stage_info.stage_type == StageType::User + && stage_info.stage_name == ctx.get_current_user()?.name) + { + return Err(ErrorCode::PermissionDenied(format!( + "Permission denied: privilege READ is required on stage {} for user {}", + stage_info.stage_name.clone(), + &ctx.get_current_user()?.identity().display(), + ))); + } + } + let files_info = StageFilesInfo { + path: path.clone(), + ..self.args_parsed.files_info.clone() + }; + + let file_format_params = match &self.args_parsed.file_format { + Some(f) => ctx.get_file_format(f).await?, + None => stage_info.file_format_params.clone(), + }; + let operator = init_stage_operator(&stage_info)?; + let stage_file_infos = files_info.list(&operator, 1, None).await?; + Ok(( + PartStatistics::default(), + Partitions::create(PartitionsShuffleKind::Seq, vec![ + InferSchemaPartInfo::create(files_info, file_format_params, stage_info, stage_file_infos), + ]), + )) } fn table_args(&self) -> Option { @@ -113,10 +203,89 @@ impl Table for InferSchemaTable { pipeline: &mut Pipeline, _put_cache: bool, ) -> Result<()> { - pipeline.add_source( - |output| InferSchemaSource::create(ctx.clone(), output, self.args_parsed.clone()), - 1, - )?; + let Some(part) = ctx.get_partition() else { + return Ok(()); + }; + let info = InferSchemaPartInfo::from_part(&part)?; + + match info.file_format_params { + FileFormatParams::Csv(_) | FileFormatParams::NdJson(_) => { + let partitions = info.stage_file_infos + .iter() + .map(|v| { + let part = SingleFilePartition { + path: v.path.clone(), + size: v.size as usize, + }; + let part_info: Box = Box::new(part); + Arc::new(part_info) + }) + .collect::>(); + ctx.set_partitions(Partitions::create(PartitionsShuffleKind::Seq, partitions))?; + Self::build_read_stage_source(ctx.clone(), pipeline, &info.stage_info)?; + + let 
stage_table_info = StageTableInfo { + stage_root: "".to_string(), + stage_info: info.stage_info.clone(), + schema: Arc::new(Default::default()), + default_exprs: None, + files_info: info.files_info.clone(), + files_to_copy: None, + duplicated_files_detected: vec![], + is_select: false, + copy_into_table_options: Default::default(), + is_variant: false, + }; + + let load_ctx = Arc::new(LoadContext::try_create_for_copy( + ctx.clone(), + &stage_table_info, + None, + BlockThresholds::default(), + vec![], + )?); + + let mut algo = None; + + for file_info in info.stage_file_infos.iter() { + let Some(new_algo) = CompressAlgorithm::from_path(&file_info.path) else { continue }; + + if let Some(algo) = algo { + if algo != new_algo { + return Err(ErrorCode::UnknownCompressionType("`infer_schema` only supports single compression type")); + } + } + algo = Some(new_algo); + } + if algo.is_some() { + pipeline.try_add_accumulating_transformer(|| { + Decompressor::try_create(load_ctx.clone(), algo) + })?; + } + pipeline.add_accumulating_transformer(|| { + InferSchemaSeparator::create(info.file_format_params.clone(), self.args_parsed.max_records) + }); + } + FileFormatParams::Parquet(_) => { + pipeline.add_source( + |output| { + ParquetInferSchemaSource::create( + ctx.clone(), + output, + info.stage_info.clone(), + info.stage_file_infos.clone(), + ) + }, + 1, + )?; + } + _ => { + return Err(ErrorCode::BadArguments( + "infer_schema is currently limited to format Parquet, CSV and NDJSON", + )); + } + } + Ok(()) } } diff --git a/src/query/service/src/table_functions/infer_schema/mod.rs b/src/query/service/src/table_functions/infer_schema/mod.rs index 7bc1731b442b4..3009bdfa92daa 100644 --- a/src/query/service/src/table_functions/infer_schema/mod.rs +++ b/src/query/service/src/table_functions/infer_schema/mod.rs @@ -13,7 +13,8 @@ // limitations under the License. mod infer_schema_table; -mod source; +mod parquet; mod table_args; +mod separator; pub use infer_schema_table::InferSchemaTable; diff --git a/src/query/service/src/table_functions/infer_schema/parquet.rs b/src/query/service/src/table_functions/infer_schema/parquet.rs new file mode 100644 index 0000000000000..ebe13d80d434b --- /dev/null +++ b/src/query/service/src/table_functions/infer_schema/parquet.rs @@ -0,0 +1,99 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; + +use arrow_schema::Schema; +use databend_common_catalog::table_context::TableContext; +use databend_common_exception::Result; +use databend_common_expression::types::BooleanType; +use databend_common_expression::types::StringType; +use databend_common_expression::types::UInt64Type; +use databend_common_expression::DataBlock; +use databend_common_expression::FromData; +use databend_common_expression::TableSchema; +use databend_common_meta_app::principal::StageInfo; +use databend_common_pipeline_core::processors::OutputPort; +use databend_common_pipeline_core::processors::ProcessorPtr; +use databend_common_pipeline_sources::AsyncSource; +use databend_common_pipeline_sources::AsyncSourcer; +use databend_common_storage::{init_stage_operator, StageFileInfo}; +use databend_common_storage::read_parquet_schema_async_rs; +use futures_util::future::try_join_all; + +use crate::table_functions::infer_schema::infer_schema_table::INFER_SCHEMA; + +pub(crate) struct ParquetInferSchemaSource { + is_finished: bool, + + stage_info: StageInfo, + stage_file_infos: Vec<StageFileInfo>, +} + +impl ParquetInferSchemaSource { + pub fn create( + ctx: Arc<dyn TableContext>, + output: Arc<OutputPort>, + stage_info: StageInfo, + stage_file_infos: Vec<StageFileInfo>, + ) -> Result<ProcessorPtr> { + AsyncSourcer::create(ctx, output, ParquetInferSchemaSource { + is_finished: false, + stage_info, + stage_file_infos, + }) + } +} + +#[async_trait::async_trait] +impl AsyncSource for ParquetInferSchemaSource { + const NAME: &'static str = INFER_SCHEMA; + + #[async_backtrace::framed] + async fn generate(&mut self) -> Result<Option<DataBlock>> { + if self.is_finished { + return Ok(None); + } + self.is_finished = true; + + let operator = init_stage_operator(&self.stage_info)?; + let infer_schema_futures = self.stage_file_infos.iter().map(|file| async { + read_parquet_schema_async_rs(&operator, &file.path, Some(file.size)).await + }); + let arrow_schema = Schema::try_merge(try_join_all(infer_schema_futures).await?)?; + let table_schema = TableSchema::try_from(&arrow_schema)?; + + let mut names: Vec<String> = vec![]; + let mut types: Vec<String> = vec![]; + let mut nulls: Vec<bool> = vec![]; + + for field in table_schema.fields().iter() { + names.push(field.name().to_string()); + + let non_null_type = field.data_type().remove_recursive_nullable(); + types.push(non_null_type.sql_name()); + nulls.push(field.is_nullable()); + } + + let order_ids = (0..table_schema.fields().len() as u64).collect::<Vec<u64>>(); + + let block = DataBlock::new_from_columns(vec![ + StringType::from_data(names), + StringType::from_data(types), + BooleanType::from_data(nulls), + UInt64Type::from_data(order_ids), + ]); + Ok(Some(block)) + } +} diff --git a/src/query/service/src/table_functions/infer_schema/separator.rs b/src/query/service/src/table_functions/infer_schema/separator.rs new file mode 100644 index 0000000000000..24f838d4f9cb7 --- /dev/null +++ b/src/query/service/src/table_functions/infer_schema/separator.rs @@ -0,0 +1,140 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +use std::io::Cursor; +use arrow_csv::reader::Format; +use arrow_json::reader::{infer_json_schema_from_iterator, ValueIter}; +use arrow_schema::{ArrowError, Schema}; +use databend_common_expression::{BlockMetaInfoDowncast, DataBlock, FromData, TableSchema}; +use databend_common_pipeline_transforms::AccumulatingTransform; +use databend_common_exception::{ErrorCode, Result}; +use databend_common_expression::types::{BooleanType, StringType, UInt64Type}; +use databend_common_meta_app::principal::FileFormatParams; +use databend_common_storages_stage::BytesBatch; + +pub struct InferSchemaSeparator { + pub file_format_params: FileFormatParams, + pub bytes_buf: Vec<u8>, + pub max_records: Option<usize>, + is_finished: bool, +} + +impl InferSchemaSeparator { + pub fn create(file_format_params: FileFormatParams, max_records: Option<usize>) -> Self { + InferSchemaSeparator { + file_format_params, + bytes_buf: vec![], + max_records, + is_finished: false, + } + } +} + +impl AccumulatingTransform for InferSchemaSeparator { + const NAME: &'static str = "InferSchemaSeparator"; + + fn transform(&mut self, data: DataBlock) -> Result<Vec<DataBlock>> { + if self.is_finished { + return Ok(vec![DataBlock::empty()]); + } + let batch = data + .get_owned_meta() + .and_then(BytesBatch::downcast_from) + .unwrap(); + self.bytes_buf.extend(batch.data); + + // If max_records is set, try to infer from the bytes buffered so far; otherwise wait until all bytes are buffered + if self.max_records.is_none() && !batch.is_eof { + return Ok(vec![DataBlock::empty()]); + } + let bytes = Cursor::new(&self.bytes_buf); + let result = match &self.file_format_params { + FileFormatParams::Csv(params) => { + let escape = if params.escape.is_empty() { + None + } else { + Some(params.escape.as_bytes()[0]) + }; + + let mut format = Format::default() + .with_delimiter(params.field_delimiter.as_bytes()[0]) + .with_quote(params.quote.as_bytes()[0]) + .with_header(params.headers != 0); + if let Some(escape) = escape { + format = format.with_escape(escape); + } + format.infer_schema(bytes, self.max_records).map(|(schema, _)| schema).map_err(Some) + } + FileFormatParams::NdJson(_) => { + let mut records = ValueIter::new(bytes, self.max_records); + let fn_ndjson = |max_records| -> std::result::Result<Schema, Option<ArrowError>> { + if let Some(max_record) = max_records { + let mut tmp: Vec<std::result::Result<serde_json::Value, ArrowError>> = + Vec::with_capacity(max_record); + + for result in records { + tmp.push(Ok(result.map_err(|_| None)?)); + } + infer_json_schema_from_iterator(tmp.into_iter()).map_err(Some) + } else { + infer_json_schema_from_iterator(&mut records).map_err(Some) + } + }; + fn_ndjson(self.max_records) + } + _ => { + return Err(ErrorCode::BadArguments( + "InferSchemaSeparator is currently limited to format CSV and NDJSON", + )); + } + }; + let arrow_schema = match result { + Ok(schema) => schema, + Err(None) => { + return Ok(vec![DataBlock::empty()]) + } + Err(Some(err)) => { + if matches!(err, ArrowError::CsvError(_)) && self.max_records.is_some() && !batch.is_eof { + return Ok(vec![DataBlock::empty()]); + } + return Err(err.into()); + } + }; + self.is_finished = true; + + let table_schema = TableSchema::try_from(&arrow_schema)?; + + let mut names: Vec<String> = vec![]; + let mut types: Vec<String> = vec![]; + let mut nulls: Vec<bool> = vec![]; + + for field in table_schema.fields().iter() { + names.push(field.name().to_string()); + + let non_null_type = field.data_type().remove_recursive_nullable(); + types.push(non_null_type.sql_name()); + nulls.push(field.is_nullable()); + } + + let order_ids = (0..table_schema.fields().len() as u64).collect::<Vec<u64>>(); + + let
block = DataBlock::new_from_columns(vec![ + StringType::from_data(names), + StringType::from_data(types), + BooleanType::from_data(nulls), + UInt64Type::from_data(order_ids), + ]); + Ok(vec![block]) + } +} \ No newline at end of file diff --git a/src/query/service/src/table_functions/infer_schema/source.rs b/src/query/service/src/table_functions/infer_schema/source.rs deleted file mode 100644 index c58b654fec3c9..0000000000000 --- a/src/query/service/src/table_functions/infer_schema/source.rs +++ /dev/null @@ -1,298 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::borrow::Cow; -use std::cmp; -use std::collections::BTreeMap; -use std::io::Cursor; -use std::sync::Arc; - -use arrow_csv::reader::Format; -use arrow_json::reader::infer_json_schema_from_iterator; -use arrow_json::reader::ValueIter; -use arrow_schema::ArrowError; -use arrow_schema::Schema; -use arrow_schema::Schema as ArrowSchema; -use bytes::BufMut; -use databend_common_ast::ast::FileLocation; -use databend_common_ast::ast::UriLocation; -use databend_common_catalog::table_context::TableContext; -use databend_common_compress::CompressAlgorithm; -use databend_common_compress::DecompressDecoder; -use databend_common_exception::ErrorCode; -use databend_common_exception::Result; -use databend_common_expression::types::BooleanType; -use databend_common_expression::types::StringType; -use databend_common_expression::types::UInt64Type; -use databend_common_expression::DataBlock; -use databend_common_expression::FromData; -use databend_common_expression::TableSchema; -use databend_common_meta_app::principal::FileFormatParams; -use databend_common_meta_app::principal::StageType; -use databend_common_pipeline_core::processors::OutputPort; -use databend_common_pipeline_core::processors::ProcessorPtr; -use databend_common_pipeline_sources::AsyncSource; -use databend_common_pipeline_sources::AsyncSourcer; -use databend_common_sql::binder::resolve_file_location; -use databend_common_storage::init_stage_operator; -use databend_common_storage::read_parquet_schema_async_rs; -use databend_common_storage::StageFilesInfo; -use databend_common_users::Object; -use futures_util::future::try_join_all; -use opendal::Operator; -use opendal::Scheme; - -use crate::table_functions::infer_schema::infer_schema_table::INFER_SCHEMA; -use crate::table_functions::infer_schema::table_args::InferSchemaArgsParsed; - -const DEFAULT_BYTES: u64 = 10; - -pub(crate) struct InferSchemaSource { - is_finished: bool, - ctx: Arc, - args_parsed: InferSchemaArgsParsed, -} - -impl InferSchemaSource { - pub fn create( - ctx: Arc, - output: Arc, - args_parsed: InferSchemaArgsParsed, - ) -> Result { - AsyncSourcer::create(ctx.clone(), output, InferSchemaSource { - is_finished: false, - ctx, - args_parsed, - }) - } -} - -#[async_trait::async_trait] -impl AsyncSource for InferSchemaSource { - const NAME: &'static str = INFER_SCHEMA; - - #[async_backtrace::framed] - async fn generate(&mut self) -> Result> { - if 
self.is_finished { - return Ok(None); - } - self.is_finished = true; - - let file_location = if let Some(location) = - self.args_parsed.location.clone().strip_prefix('@') - { - FileLocation::Stage(location.to_string()) - } else if let Some(connection_name) = &self.args_parsed.connection_name { - let conn = self.ctx.get_connection(connection_name).await?; - let uri = - UriLocation::from_uri(self.args_parsed.location.clone(), conn.storage_params)?; - let proto = conn.storage_type.parse::()?; - if proto != uri.protocol.parse::()? { - return Err(ErrorCode::BadArguments(format!( - "protocol from connection_name={connection_name} ({proto}) not match with uri protocol ({0}).", - uri.protocol - ))); - } - FileLocation::Uri(uri) - } else { - let uri = - UriLocation::from_uri(self.args_parsed.location.clone(), BTreeMap::default())?; - FileLocation::Uri(uri) - }; - let (stage_info, path) = resolve_file_location(self.ctx.as_ref(), &file_location).await?; - let enable_experimental_rbac_check = self - .ctx - .get_settings() - .get_enable_experimental_rbac_check()?; - if enable_experimental_rbac_check { - let visibility_checker = self - .ctx - .get_visibility_checker(false, Object::Stage) - .await?; - if !(stage_info.is_temporary - || visibility_checker.check_stage_read_visibility(&stage_info.stage_name) - || stage_info.stage_type == StageType::User - && stage_info.stage_name == self.ctx.get_current_user()?.name) - { - return Err(ErrorCode::PermissionDenied(format!( - "Permission denied: privilege READ is required on stage {} for user {}", - stage_info.stage_name.clone(), - &self.ctx.get_current_user()?.identity().display(), - ))); - } - } - let files_info = StageFilesInfo { - path: path.clone(), - ..self.args_parsed.files_info.clone() - }; - let operator = init_stage_operator(&stage_info)?; - - let stage_file_infos = files_info.list(&operator, 1, None).await?; - let infer_schema_futures = stage_file_infos.iter().map(|file| async { - let file_format_params = match &self.args_parsed.file_format { - Some(f) => self.ctx.get_file_format(f).await?, - None => stage_info.file_format_params.clone(), - }; - let schema = match file_format_params { - FileFormatParams::Csv(params) => { - let escape = if params.escape.is_empty() { - None - } else { - Some(params.escape.as_bytes()[0]) - }; - - let mut format = Format::default() - .with_delimiter(params.field_delimiter.as_bytes()[0]) - .with_quote(params.quote.as_bytes()[0]) - .with_header(params.headers != 0); - if let Some(escape) = escape { - format = format.with_escape(escape); - } - - read_metadata_async( - &file.path, - &operator, - Some(file.size), - self.args_parsed.max_records, - |reader, max_record| format.infer_schema(reader, max_record).map_err(Some), - ) - .await? - } - FileFormatParams::NdJson(_) => { - read_metadata_async( - &file.path, - &operator, - Some(file.size), - self.args_parsed.max_records, - |reader, max_record| { - let mut records = ValueIter::new(reader, max_record); - - let schema = if let Some(max_record) = max_record { - let mut tmp: Vec> = - Vec::with_capacity(max_record); - - for result in records { - tmp.push(Ok(result.map_err(|_| None)?)); - } - infer_json_schema_from_iterator(tmp.into_iter()).map_err(Some)? - } else { - infer_json_schema_from_iterator(&mut records).map_err(Some)? - }; - - Ok((schema, 0)) - }, - ) - .await? - } - FileFormatParams::Parquet(_) => { - read_parquet_schema_async_rs(&operator, &file.path, Some(file.size)).await? 
- } - _ => { - return Err(ErrorCode::BadArguments( - "infer_schema is currently limited to format Parquet, CSV and NDJSON", - )); - } - }; - Ok(schema) - }); - let arrow_schema = Schema::try_merge(try_join_all(infer_schema_futures).await?)?; - let table_schema = TableSchema::try_from(&arrow_schema)?; - - let mut names: Vec = vec![]; - let mut types: Vec = vec![]; - let mut nulls: Vec = vec![]; - - for field in table_schema.fields().iter() { - names.push(field.name().to_string()); - - let non_null_type = field.data_type().remove_recursive_nullable(); - types.push(non_null_type.sql_name()); - nulls.push(field.is_nullable()); - } - - let order_ids = (0..table_schema.fields().len() as u64).collect::>(); - - let block = DataBlock::new_from_columns(vec![ - StringType::from_data(names), - StringType::from_data(types), - BooleanType::from_data(nulls), - UInt64Type::from_data(order_ids), - ]); - Ok(Some(block)) - } -} - -pub async fn read_metadata_async< - F: Fn( - Cursor<&[u8]>, - Option, - ) -> std::result::Result<(ArrowSchema, usize), Option>, ->( - path: &str, - operator: &Operator, - file_size: Option, - max_records: Option, - func_infer_schema: F, -) -> Result { - let file_size = match file_size { - None => operator.stat(path).await?.content_length(), - Some(n) => n, - }; - let algo = CompressAlgorithm::from_path(path); - - let mut buf = Vec::new(); - let mut offset: u64 = 0; - let mut chunk_size: u64 = - if max_records.is_none() || matches!(algo, Some(CompressAlgorithm::Zip)) { - file_size - } else { - DEFAULT_BYTES - }; - - loop { - let end = cmp::min(offset + chunk_size, file_size); - - let chunk = operator.read_with(path).range(offset..end).await?; - buf.put(chunk); - - offset = end; - - let bytes = if let Some(algo) = algo { - let decompress_bytes = if CompressAlgorithm::Zip == algo { - DecompressDecoder::decompress_all_zip(&buf)? - } else { - DecompressDecoder::new(algo).decompress_batch(&buf)? - }; - Cow::Owned(decompress_bytes) - } else { - Cow::Borrowed(&buf) - }; - - if !bytes.is_empty() || offset >= file_size { - match func_infer_schema(Cursor::new(bytes.as_slice()), max_records) { - Ok((schema, _)) => { - return Ok(schema); - } - Err(Some(err)) => { - if matches!(err, ArrowError::CsvError(_)) && offset < file_size { - continue; - } - return Err(ErrorCode::from(err)); - } - Err(None) => (), - } - } - chunk_size = cmp::min(chunk_size * 2, file_size - offset); - } -} diff --git a/src/query/storages/stage/src/infer_schema.rs b/src/query/storages/stage/src/infer_schema.rs new file mode 100644 index 0000000000000..d986e19ab9736 --- /dev/null +++ b/src/query/storages/stage/src/infer_schema.rs @@ -0,0 +1,79 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::any::Any; +use std::hash::{DefaultHasher, Hash, Hasher}; +use std::sync::Arc; +use databend_common_catalog::plan::{PartInfo, PartInfoPtr, PartInfoType}; +use databend_common_exception::ErrorCode; +use databend_common_meta_app::principal::{FileFormatParams, StageInfo}; +use databend_common_storage::{StageFileInfo, StageFilesInfo}; + +#[derive(serde::Serialize, serde::Deserialize, PartialEq, Eq)] +pub struct InferSchemaPartInfo { + pub files_info: StageFilesInfo, + pub file_format_params: FileFormatParams, + pub stage_info: StageInfo, + pub stage_file_infos: Vec<StageFileInfo>, + +} + +#[typetag::serde(name = "infer_schema")] +impl PartInfo for InferSchemaPartInfo { + fn as_any(&self) -> &dyn Any { + self + } + + fn equals(&self, info: &Box<dyn PartInfo>) -> bool { + info.as_any() + .downcast_ref::<InferSchemaPartInfo>() + .is_some_and(|other| self == other) + } + + fn hash(&self) -> u64 { + let mut s = DefaultHasher::new(); + self.files_info.hash(&mut s); + self.file_format_params.hash(&mut s); + self.stage_info.hash(&mut s); + s.finish() + } + + fn part_type(&self) -> PartInfoType { + PartInfoType::LazyLevel + } +} + +impl InferSchemaPartInfo { + pub fn create( + files_info: StageFilesInfo, + file_format_params: FileFormatParams, + stage_info: StageInfo, + stage_file_infos: Vec<StageFileInfo>, + ) -> PartInfoPtr { + Arc::new(Box::new(InferSchemaPartInfo { + files_info, + file_format_params, + stage_info, + stage_file_infos, + })) + } + + pub fn from_part(info: &PartInfoPtr) -> databend_common_exception::Result<&InferSchemaPartInfo> { + info.as_any() + .downcast_ref::<InferSchemaPartInfo>() + .ok_or_else(|| { + ErrorCode::Internal("Cannot downcast from PartInfo to InferSchemaPartInfo.") + }) + } +} \ No newline at end of file diff --git a/src/query/storages/stage/src/lib.rs b/src/query/storages/stage/src/lib.rs index 96573e42f4af2..e52a1ab93215b 100644 --- a/src/query/storages/stage/src/lib.rs +++ b/src/query/storages/stage/src/lib.rs @@ -26,6 +26,7 @@ mod stage_table; mod streaming_load; mod transform_generating; mod transform_null_if; +mod infer_schema; pub use append::StageSinkTable; pub use compression::get_compression_with_path; @@ -33,3 +34,7 @@ pub use read::row_based::BytesBatch; pub use stage_table::StageTable; pub use streaming_load::build_streaming_load_pipeline; pub use transform_null_if::TransformNullIf; +pub use infer_schema::InferSchemaPartInfo; +pub use read::row_based::BytesReader; +pub use read::row_based::Decompressor; +pub use read::LoadContext; diff --git a/src/query/storages/stage/src/read/mod.rs b/src/query/storages/stage/src/read/mod.rs index 98f677c51fbde..984e6c62f78a5 100644 --- a/src/query/storages/stage/src/read/mod.rs +++ b/src/query/storages/stage/src/read/mod.rs @@ -20,3 +20,4 @@ pub mod row_based; pub(crate) mod block_builder_state; mod default_expr_evaluator; pub(crate) mod whole_file_reader; +pub use load_context::LoadContext; diff --git a/src/query/storages/stage/src/read/row_based/mod.rs b/src/query/storages/stage/src/read/row_based/mod.rs index a630ae43fabfc..409236777b951 100644 --- a/src/query/storages/stage/src/read/row_based/mod.rs +++ b/src/query/storages/stage/src/read/row_based/mod.rs @@ -21,3 +21,5 @@ mod utils; pub use batch::BytesBatch; pub use read_pipeline::RowBasedReadPipelineBuilder; +pub use processors::BytesReader; +pub use processors::Decompressor; diff --git a/tests/data/csv/max_records.csv.xz b/tests/data/csv/max_records.xz similarity index 100% rename from tests/data/csv/max_records.csv.xz rename to tests/data/csv/max_records.xz diff --git a/tests/data/csv/max_records.csv.zst
b/tests/data/csv/max_records.zst similarity index 100% rename from tests/data/csv/max_records.csv.zst rename to tests/data/csv/max_records.zst diff --git a/tests/data/ndjson/max_records.ndjson.xz b/tests/data/ndjson/max_records.xz similarity index 100% rename from tests/data/ndjson/max_records.ndjson.xz rename to tests/data/ndjson/max_records.xz diff --git a/tests/data/ndjson/max_records.ndjson.zst b/tests/data/ndjson/max_records.zst similarity index 100% rename from tests/data/ndjson/max_records.ndjson.zst rename to tests/data/ndjson/max_records.zst diff --git a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test index f5c0cc04546cf..2629bd5a0f351 100644 --- a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test +++ b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test @@ -105,7 +105,7 @@ id BIGINT 1 0 value BIGINT 1 1 query TTBI -select * from infer_schema(location => '@data/csv/max_records.csv.zst', file_format => 'head_csv_format', max_records_pre_file => 5); +select * from infer_schema(location => '@data/csv/max_records.zst', file_format => 'head_csv_format', max_records_pre_file => 5); ---- id BIGINT 1 0 value BIGINT 1 1 @@ -117,7 +117,7 @@ id BIGINT 1 0 value BIGINT 1 1 query TTBI -select * from infer_schema(location => '@data/csv/max_records.csv.xz', file_format => 'head_csv_format', max_records_pre_file => 5); +select * from infer_schema(location => '@data/csv/max_records.xz', file_format => 'head_csv_format', max_records_pre_file => 5); ---- id BIGINT 1 0 value BIGINT 1 1 @@ -138,6 +138,11 @@ utf8_col VARCHAR 1 8 query TTBI select * from infer_schema(location => '@data/csv/merge/', file_format => 'head_csv_format'); ---- +col1 VARCHAR 1 0 +col2 VARCHAR 1 1 +col3 VARCHAR 1 2 +col4 VARCHAR 1 3 +col5 VARCHAR 1 4 # NDJSON query TTBI @@ -172,7 +177,7 @@ id BIGINT 1 0 value BIGINT 1 1 query TTBI -select * from infer_schema(location => '@data/ndjson/max_records.ndjson.zst', file_format => 'NDJSON', max_records_pre_file => 5); +select * from infer_schema(location => '@data/ndjson/max_records.zst', file_format => 'NDJSON', max_records_pre_file => 5); ---- id BIGINT 1 0 value BIGINT 1 1 @@ -184,7 +189,7 @@ id BIGINT 1 0 value BIGINT 1 1 query TTBI -select * from infer_schema(location => '@data/ndjson/max_records.ndjson.xz', file_format => 'NDJSON', max_records_pre_file => 5) +select * from infer_schema(location => '@data/ndjson/max_records.xz', file_format => 'NDJSON', max_records_pre_file => 5) ---- id BIGINT 1 0 value BIGINT 1 1 @@ -207,3 +212,8 @@ obj_col TUPLE(A INT64, B STRING) 1 10 query TTBI select * from infer_schema(location => '@data/ndjson/merge/', file_format => 'NDJSON'); ---- +col1 VARCHAR 1 0 +col2 VARCHAR 1 1 +col3 VARCHAR 1 2 +col4 VARCHAR 1 3 +col5 VARCHAR 1 4 From 178aacffd322859108e430de82dd0fe5c3bc16e1 Mon Sep 17 00:00:00 2001 From: kould Date: Thu, 4 Sep 2025 17:53:52 +0800 Subject: [PATCH 15/20] feat: InferSeparator multi-file processing and Schema promote merging type --- src/query/ast/src/ast/statements/copy.rs | 4 +- .../infer_schema/infer_schema_table.rs | 42 ++- .../src/table_functions/infer_schema/merge.rs | 284 ++++++++++++++++++ .../src/table_functions/infer_schema/mod.rs | 3 +- .../table_functions/infer_schema/parquet.rs | 4 +- .../table_functions/infer_schema/separator.rs | 80 +++-- src/query/storages/stage/src/infer_schema.rs | 22 +- src/query/storages/stage/src/lib.rs | 10 +- .../storages/stage/src/read/row_based/mod.rs | 2 +- 
.../stage/formats/parquet/infer_schema.test | 10 +- 10 files changed, 410 insertions(+), 51 deletions(-) create mode 100644 src/query/service/src/table_functions/infer_schema/merge.rs diff --git a/src/query/ast/src/ast/statements/copy.rs b/src/query/ast/src/ast/statements/copy.rs index 7712581b32880..b207b05d7d879 100644 --- a/src/query/ast/src/ast/statements/copy.rs +++ b/src/query/ast/src/ast/statements/copy.rs @@ -648,7 +648,9 @@ impl Display for FileFormatValue { } } -#[derive(serde::Serialize, serde::Deserialize, Debug, Clone, PartialEq, Drive, DriveMut, Eq, Hash)] +#[derive( + serde::Serialize, serde::Deserialize, Debug, Clone, PartialEq, Drive, DriveMut, Eq, Hash, +)] pub enum OnErrorMode { Continue, SkipFileNum(u64), diff --git a/src/query/service/src/table_functions/infer_schema/infer_schema_table.rs b/src/query/service/src/table_functions/infer_schema/infer_schema_table.rs index 1b90dd139d1d1..fd4ae9c947fcd 100644 --- a/src/query/service/src/table_functions/infer_schema/infer_schema_table.rs +++ b/src/query/service/src/table_functions/infer_schema/infer_schema_table.rs @@ -18,17 +18,21 @@ use std::sync::Arc; use databend_common_ast::ast::FileLocation; use databend_common_ast::ast::UriLocation; -use databend_common_catalog::plan::{DataSourcePlan, PartInfo, StageTableInfo}; +use databend_common_catalog::plan::DataSourcePlan; +use databend_common_catalog::plan::PartInfo; use databend_common_catalog::plan::PartStatistics; use databend_common_catalog::plan::Partitions; use databend_common_catalog::plan::PartitionsShuffleKind; use databend_common_catalog::plan::PushDownInfo; +use databend_common_catalog::plan::StageTableInfo; use databend_common_catalog::table::Table; use databend_common_catalog::table_args::TableArgs; +use databend_common_compress::CompressAlgorithm; use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::types::NumberDataType; -use databend_common_expression::{BlockThresholds, TableDataType}; +use databend_common_expression::BlockThresholds; +use databend_common_expression::TableDataType; use databend_common_expression::TableField; use databend_common_expression::TableSchema; use databend_common_expression::TableSchemaRefExt; @@ -40,16 +44,18 @@ use databend_common_meta_app::schema::TableInfo; use databend_common_meta_app::schema::TableMeta; use databend_common_pipeline_core::Pipeline; use databend_common_pipeline_sources::PrefetchAsyncSourcer; +use databend_common_pipeline_transforms::TransformPipelineHelper; use databend_common_sql::binder::resolve_file_location; use databend_common_storage::init_stage_operator; use databend_common_storage::StageFilesInfo; -use databend_common_storages_stage::{BytesReader, Decompressor, LoadContext}; +use databend_common_storages_stage::BytesReader; +use databend_common_storages_stage::Decompressor; use databend_common_storages_stage::InferSchemaPartInfo; +use databend_common_storages_stage::LoadContext; use databend_common_users::Object; -use opendal::Scheme; -use databend_common_compress::CompressAlgorithm; -use databend_common_pipeline_transforms::TransformPipelineHelper; use databend_storages_common_stage::SingleFilePartition; +use opendal::Scheme; + use super::parquet::ParquetInferSchemaSource; use crate::sessions::TableContext; use crate::table_functions::infer_schema::separator::InferSchemaSeparator; @@ -187,7 +193,12 @@ impl Table for InferSchemaTable { Ok(( PartStatistics::default(), Partitions::create(PartitionsShuffleKind::Seq, vec![ - 
InferSchemaPartInfo::create(files_info, file_format_params, stage_info, stage_file_infos), + InferSchemaPartInfo::create( + files_info, + file_format_params, + stage_info, + stage_file_infos, + ), ]), )) } @@ -210,7 +221,8 @@ impl Table for InferSchemaTable { match info.file_format_params { FileFormatParams::Csv(_) | FileFormatParams::NdJson(_) => { - let partitions = info.stage_file_infos + let partitions = info + .stage_file_infos .iter() .map(|v| { let part = SingleFilePartition { @@ -248,11 +260,15 @@ impl Table for InferSchemaTable { let mut algo = None; for file_info in info.stage_file_infos.iter() { - let Some(new_algo) = CompressAlgorithm::from_path(&file_info.path) else { continue }; + let Some(new_algo) = CompressAlgorithm::from_path(&file_info.path) else { + continue; + }; if let Some(algo) = algo { if algo != new_algo { - return Err(ErrorCode::UnknownCompressionType("`infer_schema` only supports single compression type")); + return Err(ErrorCode::UnknownCompressionType( + "`infer_schema` only supports single compression type", + )); } } algo = Some(new_algo); @@ -263,7 +279,11 @@ impl Table for InferSchemaTable { })?; } pipeline.add_accumulating_transformer(|| { - InferSchemaSeparator::create(info.file_format_params.clone(), self.args_parsed.max_records) + InferSchemaSeparator::create( + info.file_format_params.clone(), + self.args_parsed.max_records, + info.stage_file_infos.len(), + ) }); } FileFormatParams::Parquet(_) => { diff --git a/src/query/service/src/table_functions/infer_schema/merge.rs b/src/query/service/src/table_functions/infer_schema/merge.rs new file mode 100644 index 0000000000000..1b441f6a68b24 --- /dev/null +++ b/src/query/service/src/table_functions/infer_schema/merge.rs @@ -0,0 +1,284 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use databend_common_expression::types::NumberDataType; +use databend_common_expression::TableDataType; +use databend_common_expression::TableSchema; + +const UNSIGNED_TYPES: [NumberDataType; 4] = [ + NumberDataType::UInt8, + NumberDataType::UInt16, + NumberDataType::UInt32, + NumberDataType::UInt64, +]; +const SIGNED_TYPES: [NumberDataType; 4] = [ + NumberDataType::Int8, + NumberDataType::Int16, + NumberDataType::Int32, + NumberDataType::Int64, +]; +const FLOAT_TYPES: [NumberDataType; 2] = [NumberDataType::Float32, NumberDataType::Float64]; + +fn wrap_nullable(ty: TableDataType, is_nullable: bool) -> TableDataType { + if is_nullable { + ty.wrap_nullable() + } else { + ty + } +} + +pub fn merge_type( + old: TableDataType, + new: TableDataType, + is_nullable: bool, +) -> Option { + if old.remove_nullable() == new.remove_nullable() { + return Some(wrap_nullable(old, is_nullable)); + } + if let (TableDataType::Number(old_num), TableDataType::Number(new_num)) = + (new.remove_nullable(), old.remove_nullable()) + { + if old_num.is_float() && new_num.is_float() { + return promote_numeric(&old, &new, &FLOAT_TYPES) + .map(|ty| wrap_nullable(ty, is_nullable)); + } + return promote_numeric(&old, &new, &SIGNED_TYPES) + .or_else(|| promote_numeric(&old, &new, &UNSIGNED_TYPES)) + .map(|ty| wrap_nullable(ty, is_nullable)); + } + None +} + +pub fn promote_numeric( + a: &TableDataType, + b: &TableDataType, + types: &[NumberDataType], +) -> Option { + let idx_a = match a { + TableDataType::Number(n) => types.iter().position(|t| t == n), + _ => None, + }; + let idx_b = match b { + TableDataType::Number(n) => types.iter().position(|t| t == n), + _ => None, + }; + match (idx_a, idx_b) { + (Some(i), Some(j)) => Some(TableDataType::Number(types[usize::max(i, j)].clone())), + _ => None, + } +} + +pub fn merge_schema(defined: TableSchema, guess: TableSchema) -> TableSchema { + let TableSchema { + fields: mut def_fields, + .. + } = defined; + let TableSchema { + fields: guess_fields, + .. 
+ } = guess; + + for guess_field in guess_fields { + match def_fields + .iter_mut() + .find(|def_field| def_field.name() == guess_field.name()) + { + None => { + def_fields.push(guess_field); + } + Some(def_field) => { + let is_nullable = + def_field.data_type.is_nullable() || guess_field.data_type.is_nullable(); + def_field.data_type = merge_type( + def_field.data_type.clone(), + guess_field.data_type, + is_nullable, + ) + .unwrap_or_else(|| wrap_nullable(TableDataType::String, is_nullable)); + } + } + } + + TableSchema::new(def_fields) +} + +#[cfg(test)] +mod tests { + use databend_common_expression::types::NumberDataType; + use databend_common_expression::TableDataType; + use databend_common_expression::TableField; + use databend_common_expression::TableSchema; + + use crate::table_functions::infer_schema::merge::merge_schema; + use crate::table_functions::infer_schema::merge::merge_type; + + #[test] + fn test_promote_unsigned() { + assert_eq!( + merge_type( + TableDataType::Number(NumberDataType::UInt8), + TableDataType::Number(NumberDataType::UInt16), + false, + ), + Some(TableDataType::Number(NumberDataType::UInt16)) + ); + assert_eq!( + merge_type( + TableDataType::Number(NumberDataType::UInt32), + TableDataType::Number(NumberDataType::UInt64), + false, + ), + Some(TableDataType::Number(NumberDataType::UInt64)) + ); + assert_eq!( + merge_type( + TableDataType::Number(NumberDataType::UInt8), + TableDataType::Number(NumberDataType::Int8), + false, + ), + None + ); + } + + #[test] + fn test_promote_signed() { + assert_eq!( + merge_type( + TableDataType::Number(NumberDataType::Int8), + TableDataType::Number(NumberDataType::Int16), + false, + ), + Some(TableDataType::Number(NumberDataType::Int16)) + ); + assert_eq!( + merge_type( + TableDataType::Number(NumberDataType::Int32), + TableDataType::Number(NumberDataType::Int64), + false, + ), + Some(TableDataType::Number(NumberDataType::Int64)) + ); + assert_eq!( + merge_type( + TableDataType::Number(NumberDataType::Int8), + TableDataType::Number(NumberDataType::UInt8), + false, + ), + None + ); + } + + #[test] + fn test_promote_integer() { + assert_eq!( + merge_type( + TableDataType::Number(NumberDataType::Int8), + TableDataType::Number(NumberDataType::Int16), + false, + ), + Some(TableDataType::Number(NumberDataType::Int16)) + ); + assert_eq!( + merge_type( + TableDataType::Number(NumberDataType::UInt8), + TableDataType::Number(NumberDataType::UInt32), + false, + ), + Some(TableDataType::Number(NumberDataType::UInt32)) + ); + assert_eq!( + merge_type( + TableDataType::Number(NumberDataType::Int8), + TableDataType::Number(NumberDataType::UInt8), + false, + ), + None + ); + } + + #[test] + fn test_promote_float() { + assert_eq!( + merge_type( + TableDataType::Number(NumberDataType::Float32), + TableDataType::Number(NumberDataType::Float64), + false, + ), + Some(TableDataType::Number(NumberDataType::Float64)) + ); + } + + #[test] + fn test_promote_numeric() { + assert_eq!( + merge_type( + TableDataType::Number(NumberDataType::Int8), + TableDataType::Number(NumberDataType::Int16), + false, + ), + Some(TableDataType::Number(NumberDataType::Int16)) + ); + assert_eq!( + merge_type( + TableDataType::Number(NumberDataType::Float32), + TableDataType::Number(NumberDataType::Int16), + false, + ), + None + ); + assert_eq!( + merge_type( + TableDataType::String, + TableDataType::Number(NumberDataType::Int32), + false, + ), + None + ); + } + + #[test] + fn test_merge_schema() { + let schema_1 = TableSchema::new(vec![ + TableField::new( + "c1", + 
TableDataType::Nullable(Box::new(TableDataType::Number(NumberDataType::Int8))), + ), + TableField::new("c2", TableDataType::Number(NumberDataType::Int8)), + TableField::new("c3", TableDataType::Number(NumberDataType::Int32)), + TableField::new("c4", TableDataType::Number(NumberDataType::Float32)), + TableField::new("c5", TableDataType::Number(NumberDataType::Float32)), + ]); + let schema_2 = TableSchema::new(vec![ + TableField::new("c1", TableDataType::Number(NumberDataType::Int8)), + TableField::new("c3", TableDataType::Number(NumberDataType::Float32)), + TableField::new("c2", TableDataType::Number(NumberDataType::Int8)), + TableField::new("c4", TableDataType::Number(NumberDataType::Float32)), + TableField::new("c6", TableDataType::Number(NumberDataType::Float32)), + ]); + + let schema = merge_schema(schema_1, schema_2); + let expected_schema = TableSchema::new(vec![ + TableField::new( + "c1", + TableDataType::Nullable(Box::new(TableDataType::Number(NumberDataType::Int8))), + ), + TableField::new("c2", TableDataType::Number(NumberDataType::Int8)), + TableField::new("c3", TableDataType::String), + TableField::new("c4", TableDataType::Number(NumberDataType::Float32)), + TableField::new("c5", TableDataType::Number(NumberDataType::Float32)), + TableField::new("c6", TableDataType::Number(NumberDataType::Float32)), + ]); + assert_eq!(schema, expected_schema); + } +} diff --git a/src/query/service/src/table_functions/infer_schema/mod.rs b/src/query/service/src/table_functions/infer_schema/mod.rs index 3009bdfa92daa..82597e959806b 100644 --- a/src/query/service/src/table_functions/infer_schema/mod.rs +++ b/src/query/service/src/table_functions/infer_schema/mod.rs @@ -13,8 +13,9 @@ // limitations under the License. mod infer_schema_table; +mod merge; mod parquet; -mod table_args; mod separator; +mod table_args; pub use infer_schema_table::InferSchemaTable; diff --git a/src/query/service/src/table_functions/infer_schema/parquet.rs b/src/query/service/src/table_functions/infer_schema/parquet.rs index ebe13d80d434b..38e15819d0eb7 100644 --- a/src/query/service/src/table_functions/infer_schema/parquet.rs +++ b/src/query/service/src/table_functions/infer_schema/parquet.rs @@ -28,8 +28,9 @@ use databend_common_pipeline_core::processors::OutputPort; use databend_common_pipeline_core::processors::ProcessorPtr; use databend_common_pipeline_sources::AsyncSource; use databend_common_pipeline_sources::AsyncSourcer; -use databend_common_storage::{init_stage_operator, StageFileInfo}; +use databend_common_storage::init_stage_operator; use databend_common_storage::read_parquet_schema_async_rs; +use databend_common_storage::StageFileInfo; use futures_util::future::try_join_all; use crate::table_functions::infer_schema::infer_schema_table::INFER_SCHEMA; @@ -71,6 +72,7 @@ impl AsyncSource for ParquetInferSchemaSource { let infer_schema_futures = self.stage_file_infos.iter().map(|file| async { read_parquet_schema_async_rs(&operator, &file.path, Some(file.size)).await }); + // todo: unify_schemas(arrow-rs unsupported now) let arrow_schema = Schema::try_merge(try_join_all(infer_schema_futures).await?)?; let table_schema = TableSchema::try_from(&arrow_schema)?; diff --git a/src/query/service/src/table_functions/infer_schema/separator.rs b/src/query/service/src/table_functions/infer_schema/separator.rs index 24f838d4f9cb7..68335bf4a930c 100644 --- a/src/query/service/src/table_functions/infer_schema/separator.rs +++ b/src/query/service/src/table_functions/infer_schema/separator.rs @@ -12,30 +12,50 @@ // See the License 
for the specific language governing permissions and // limitations under the License. +use std::collections::HashMap; use std::io::Cursor; + use arrow_csv::reader::Format; -use arrow_json::reader::{infer_json_schema_from_iterator, ValueIter}; -use arrow_schema::{ArrowError, Schema}; -use databend_common_expression::{BlockMetaInfoDowncast, DataBlock, FromData, TableSchema}; -use databend_common_pipeline_transforms::AccumulatingTransform; -use databend_common_exception::{ErrorCode, Result}; -use databend_common_expression::types::{BooleanType, StringType, UInt64Type}; +use arrow_json::reader::infer_json_schema_from_iterator; +use arrow_json::reader::ValueIter; +use arrow_schema::ArrowError; +use arrow_schema::Schema; +use databend_common_exception::ErrorCode; +use databend_common_exception::Result; +use databend_common_expression::types::BooleanType; +use databend_common_expression::types::StringType; +use databend_common_expression::types::UInt64Type; +use databend_common_expression::BlockMetaInfoDowncast; +use databend_common_expression::DataBlock; +use databend_common_expression::FromData; +use databend_common_expression::TableSchema; use databend_common_meta_app::principal::FileFormatParams; +use databend_common_pipeline_transforms::AccumulatingTransform; use databend_common_storages_stage::BytesBatch; +use crate::table_functions::infer_schema::merge::merge_schema; + pub struct InferSchemaSeparator { pub file_format_params: FileFormatParams, - pub bytes_buf: Vec, + files: HashMap>, pub max_records: Option, + schemas: Vec, + files_len: usize, is_finished: bool, } impl InferSchemaSeparator { - pub fn create(file_format_params: FileFormatParams, max_records: Option) -> Self { + pub fn create( + file_format_params: FileFormatParams, + max_records: Option, + files_len: usize, + ) -> Self { InferSchemaSeparator { file_format_params, - bytes_buf: vec![], + files: HashMap::new(), max_records, + schemas: Vec::with_capacity(files_len), + files_len, is_finished: false, } } @@ -52,13 +72,15 @@ impl AccumulatingTransform for InferSchemaSeparator { .get_owned_meta() .and_then(BytesBatch::downcast_from) .unwrap(); - self.bytes_buf.extend(batch.data); + + let bytes = self.files.entry(batch.path.clone()).or_insert(Vec::new()); + bytes.extend(batch.data); // When max_records exists, it will try to use the current bytes to read, otherwise it will buffer all bytes if self.max_records.is_none() && !batch.is_eof { return Ok(vec![DataBlock::empty()]); } - let bytes = Cursor::new(&self.bytes_buf); + let bytes = Cursor::new(bytes); let result = match &self.file_format_params { FileFormatParams::Csv(params) => { let escape = if params.escape.is_empty() { @@ -74,7 +96,10 @@ impl AccumulatingTransform for InferSchemaSeparator { if let Some(escape) = escape { format = format.with_escape(escape); } - format.infer_schema(bytes, self.max_records).map(|(schema, _)| schema).map_err(Some) + format + .infer_schema(bytes, self.max_records) + .map(|(schema, _)| schema) + .map_err(Some) } FileFormatParams::NdJson(_) => { let mut records = ValueIter::new(bytes, self.max_records); @@ -101,19 +126,36 @@ impl AccumulatingTransform for InferSchemaSeparator { }; let arrow_schema = match result { Ok(schema) => schema, - Err(None) => { - return Ok(vec![DataBlock::empty()]) - } + Err(None) => return Ok(vec![DataBlock::empty()]), Err(Some(err)) => { - if matches!(err, ArrowError::CsvError(_)) && self.max_records.is_some() && !batch.is_eof { + if matches!(err, ArrowError::CsvError(_)) + && self.max_records.is_some() + && !batch.is_eof + 
{ return Ok(vec![DataBlock::empty()]); } return Err(err.into()); } }; - self.is_finished = true; + self.files.remove(&batch.path); + self.schemas.push(arrow_schema); - let table_schema = TableSchema::try_from(&arrow_schema)?; + if self.schemas.len() != self.files_len { + return Ok(vec![DataBlock::empty()]); + } + self.is_finished = true; + if self.schemas.len() == 0 { + return Ok(vec![DataBlock::empty()]); + } + let table_schema = if self.schemas.len() == 1 { + TableSchema::try_from(&self.schemas.pop().unwrap())? + } else { + self.schemas[1..] + .iter() + .try_fold(TableSchema::try_from(&self.schemas[0])?, |acc, schema| { + TableSchema::try_from(schema).map(|schema| merge_schema(acc, schema)) + })? + }; let mut names: Vec = vec![]; let mut types: Vec = vec![]; @@ -137,4 +179,4 @@ impl AccumulatingTransform for InferSchemaSeparator { ]); Ok(vec![block]) } -} \ No newline at end of file +} diff --git a/src/query/storages/stage/src/infer_schema.rs b/src/query/storages/stage/src/infer_schema.rs index d986e19ab9736..5a3c25a6c4910 100644 --- a/src/query/storages/stage/src/infer_schema.rs +++ b/src/query/storages/stage/src/infer_schema.rs @@ -13,12 +13,19 @@ // limitations under the License. use std::any::Any; -use std::hash::{DefaultHasher, Hash, Hasher}; +use std::hash::DefaultHasher; +use std::hash::Hash; +use std::hash::Hasher; use std::sync::Arc; -use databend_common_catalog::plan::{PartInfo, PartInfoPtr, PartInfoType}; + +use databend_common_catalog::plan::PartInfo; +use databend_common_catalog::plan::PartInfoPtr; +use databend_common_catalog::plan::PartInfoType; use databend_common_exception::ErrorCode; -use databend_common_meta_app::principal::{FileFormatParams, StageInfo}; -use databend_common_storage::{StageFileInfo, StageFilesInfo}; +use databend_common_meta_app::principal::FileFormatParams; +use databend_common_meta_app::principal::StageInfo; +use databend_common_storage::StageFileInfo; +use databend_common_storage::StageFilesInfo; #[derive(serde::Serialize, serde::Deserialize, PartialEq, Eq)] pub struct InferSchemaPartInfo { @@ -26,7 +33,6 @@ pub struct InferSchemaPartInfo { pub file_format_params: FileFormatParams, pub stage_info: StageInfo, pub stage_file_infos: Vec, - } #[typetag::serde(name = "infer_schema")] @@ -69,11 +75,13 @@ impl InferSchemaPartInfo { })) } - pub fn from_part(info: &PartInfoPtr) -> databend_common_exception::Result<&InferSchemaPartInfo> { + pub fn from_part( + info: &PartInfoPtr, + ) -> databend_common_exception::Result<&InferSchemaPartInfo> { info.as_any() .downcast_ref::() .ok_or_else(|| { ErrorCode::Internal("Cannot downcast from PartInfo to InferSchemaPartInfo.") }) } -} \ No newline at end of file +} diff --git a/src/query/storages/stage/src/lib.rs b/src/query/storages/stage/src/lib.rs index e52a1ab93215b..39e7392165464 100644 --- a/src/query/storages/stage/src/lib.rs +++ b/src/query/storages/stage/src/lib.rs @@ -21,20 +21,20 @@ mod append; mod compression; +mod infer_schema; mod read; mod stage_table; mod streaming_load; mod transform_generating; mod transform_null_if; -mod infer_schema; pub use append::StageSinkTable; pub use compression::get_compression_with_path; -pub use read::row_based::BytesBatch; -pub use stage_table::StageTable; -pub use streaming_load::build_streaming_load_pipeline; -pub use transform_null_if::TransformNullIf; pub use infer_schema::InferSchemaPartInfo; +pub use read::row_based::BytesBatch; pub use read::row_based::BytesReader; pub use read::row_based::Decompressor; pub use read::LoadContext; +pub use stage_table::StageTable; 
+pub use streaming_load::build_streaming_load_pipeline; +pub use transform_null_if::TransformNullIf; diff --git a/src/query/storages/stage/src/read/row_based/mod.rs b/src/query/storages/stage/src/read/row_based/mod.rs index 409236777b951..69e24b53db07a 100644 --- a/src/query/storages/stage/src/read/row_based/mod.rs +++ b/src/query/storages/stage/src/read/row_based/mod.rs @@ -20,6 +20,6 @@ mod read_pipeline; mod utils; pub use batch::BytesBatch; -pub use read_pipeline::RowBasedReadPipelineBuilder; pub use processors::BytesReader; pub use processors::Decompressor; +pub use read_pipeline::RowBasedReadPipelineBuilder; diff --git a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test index 2629bd5a0f351..558617d91adc0 100644 --- a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test +++ b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test @@ -61,11 +61,11 @@ drop CONNECTION IF EXISTS my_conn statement ok create CONNECTION my_conn STORAGE_TYPE = 's3' access_key_id='minioadmin' secret_access_key='minioadmin' endpoint_url='http://127.0.0.1:9900/' region='auto' -# query -# select * from INFER_SCHEMA(location => 's3://testbucket/data/parquet/tuple.parquet', connection_name => 'my_conn') -# ---- -# id INT 0 0 -# t TUPLE(A INT32, B STRING) 0 1 +query +select * from INFER_SCHEMA(location => 's3://testbucket/data/parquet/tuple.parquet', connection_name => 'my_conn') +---- +id INT 0 0 +t TUPLE(A INT32, B STRING) 0 1 # CSV statement ok From 4bd26e5476439065abe4ac1e972457a3539729cb Mon Sep 17 00:00:00 2001 From: kould Date: Thu, 4 Sep 2025 23:24:29 +0800 Subject: [PATCH 16/20] chore: codefmt --- src/query/service/src/table_functions/infer_schema/merge.rs | 2 +- .../service/src/table_functions/infer_schema/separator.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/query/service/src/table_functions/infer_schema/merge.rs b/src/query/service/src/table_functions/infer_schema/merge.rs index 1b441f6a68b24..5aa78e263a7cb 100644 --- a/src/query/service/src/table_functions/infer_schema/merge.rs +++ b/src/query/service/src/table_functions/infer_schema/merge.rs @@ -74,7 +74,7 @@ pub fn promote_numeric( _ => None, }; match (idx_a, idx_b) { - (Some(i), Some(j)) => Some(TableDataType::Number(types[usize::max(i, j)].clone())), + (Some(i), Some(j)) => Some(TableDataType::Number(types[usize::max(i, j)])), _ => None, } } diff --git a/src/query/service/src/table_functions/infer_schema/separator.rs b/src/query/service/src/table_functions/infer_schema/separator.rs index 68335bf4a930c..a485d72356003 100644 --- a/src/query/service/src/table_functions/infer_schema/separator.rs +++ b/src/query/service/src/table_functions/infer_schema/separator.rs @@ -73,7 +73,7 @@ impl AccumulatingTransform for InferSchemaSeparator { .and_then(BytesBatch::downcast_from) .unwrap(); - let bytes = self.files.entry(batch.path.clone()).or_insert(Vec::new()); + let bytes = self.files.entry(batch.path.clone()).or_default(); bytes.extend(batch.data); // When max_records exists, it will try to use the current bytes to read, otherwise it will buffer all bytes @@ -144,7 +144,7 @@ impl AccumulatingTransform for InferSchemaSeparator { return Ok(vec![DataBlock::empty()]); } self.is_finished = true; - if self.schemas.len() == 0 { + if self.schemas.is_empty() { return Ok(vec![DataBlock::empty()]); } let table_schema = if self.schemas.len() == 1 { From fb7fd0e74d000cf69a5abe2493c69b1d33ff695a Mon Sep 17 00:00:00 2001 From: 
kould Date: Tue, 9 Sep 2025 12:10:34 +0800 Subject: [PATCH 17/20] feat: impl `max_file_count` for `infer_schema` --- .../infer_schema/infer_schema_table.rs | 4 ++- .../table_functions/infer_schema/separator.rs | 29 +++++++++---------- .../infer_schema/table_args.rs | 6 ++++ tests/data/csv/max_file_count/numbers0.csv | 5 ++++ tests/data/csv/max_file_count/numbers1.csv | 4 +++ tests/data/csv/max_file_count/numbers2.csv | 4 +++ .../stage/formats/parquet/infer_schema.test | 9 ++++++ 7 files changed, 44 insertions(+), 17 deletions(-) create mode 100644 tests/data/csv/max_file_count/numbers0.csv create mode 100644 tests/data/csv/max_file_count/numbers1.csv create mode 100644 tests/data/csv/max_file_count/numbers2.csv diff --git a/src/query/service/src/table_functions/infer_schema/infer_schema_table.rs b/src/query/service/src/table_functions/infer_schema/infer_schema_table.rs index fd4ae9c947fcd..5e1ad8ac61b55 100644 --- a/src/query/service/src/table_functions/infer_schema/infer_schema_table.rs +++ b/src/query/service/src/table_functions/infer_schema/infer_schema_table.rs @@ -189,7 +189,9 @@ impl Table for InferSchemaTable { None => stage_info.file_format_params.clone(), }; let operator = init_stage_operator(&stage_info)?; - let stage_file_infos = files_info.list(&operator, 1, None).await?; + let stage_file_infos = files_info + .list(&operator, 1, self.args_parsed.max_file_count) + .await?; Ok(( PartStatistics::default(), Partitions::create(PartitionsShuffleKind::Seq, vec![ diff --git a/src/query/service/src/table_functions/infer_schema/separator.rs b/src/query/service/src/table_functions/infer_schema/separator.rs index a485d72356003..35bdbe76f3259 100644 --- a/src/query/service/src/table_functions/infer_schema/separator.rs +++ b/src/query/service/src/table_functions/infer_schema/separator.rs @@ -39,8 +39,8 @@ pub struct InferSchemaSeparator { pub file_format_params: FileFormatParams, files: HashMap>, pub max_records: Option, - schemas: Vec, - files_len: usize, + schemas: Option, + remaining_files_len: usize, is_finished: bool, } @@ -54,8 +54,8 @@ impl InferSchemaSeparator { file_format_params, files: HashMap::new(), max_records, - schemas: Vec::with_capacity(files_len), - files_len, + schemas: None, + remaining_files_len: files_len, is_finished: false, } } @@ -138,23 +138,20 @@ impl AccumulatingTransform for InferSchemaSeparator { } }; self.files.remove(&batch.path); - self.schemas.push(arrow_schema); - if self.schemas.len() != self.files_len { + let merge_schema = match self.schemas.take() { + None => TableSchema::try_from(&arrow_schema)?, + Some(schema) => merge_schema(schema, TableSchema::try_from(&arrow_schema)?), + }; + self.schemas = Some(merge_schema); + + self.remaining_files_len = self.remaining_files_len.checked_sub(1).unwrap_or(0); + if self.remaining_files_len > 0 { return Ok(vec![DataBlock::empty()]); } self.is_finished = true; - if self.schemas.is_empty() { + let Some(table_schema) = self.schemas.take() else { return Ok(vec![DataBlock::empty()]); - } - let table_schema = if self.schemas.len() == 1 { - TableSchema::try_from(&self.schemas.pop().unwrap())? - } else { - self.schemas[1..] - .iter() - .try_fold(TableSchema::try_from(&self.schemas[0])?, |acc, schema| { - TableSchema::try_from(schema).map(|schema| merge_schema(acc, schema)) - })? 
}; let mut names: Vec = vec![]; diff --git a/src/query/service/src/table_functions/infer_schema/table_args.rs b/src/query/service/src/table_functions/infer_schema/table_args.rs index 4bbf0ef113713..9781bc742ee4b 100644 --- a/src/query/service/src/table_functions/infer_schema/table_args.rs +++ b/src/query/service/src/table_functions/infer_schema/table_args.rs @@ -26,6 +26,7 @@ pub(crate) struct InferSchemaArgsParsed { pub(crate) file_format: Option, pub(crate) files_info: StageFilesInfo, pub(crate) max_records: Option, + pub(crate) max_file_count: Option, } impl InferSchemaArgsParsed { @@ -41,6 +42,7 @@ impl InferSchemaArgsParsed { pattern: None, }; let mut max_records = None; + let mut max_file_count = None; for (k, v) in &args { match k.to_lowercase().as_str() { @@ -59,6 +61,9 @@ impl InferSchemaArgsParsed { "max_records_pre_file" => { max_records = Some(i64_value(v)? as usize); } + "max_file_count" => { + max_file_count = Some(i64_value(v)? as usize); + } _ => { return Err(ErrorCode::BadArguments(format!( "unknown param {} for infer_schema", @@ -77,6 +82,7 @@ impl InferSchemaArgsParsed { file_format, files_info, max_records, + max_file_count, }) } } diff --git a/tests/data/csv/max_file_count/numbers0.csv b/tests/data/csv/max_file_count/numbers0.csv new file mode 100644 index 0000000000000..d0abce6450294 --- /dev/null +++ b/tests/data/csv/max_file_count/numbers0.csv @@ -0,0 +1,5 @@ +col1,col2,col3,col4,col5 +0,1,2,3,4 +5,6,7,8,9 +10,11,12,13,14 +a,b,c,d,e \ No newline at end of file diff --git a/tests/data/csv/max_file_count/numbers1.csv b/tests/data/csv/max_file_count/numbers1.csv new file mode 100644 index 0000000000000..a49bbf89b1d3d --- /dev/null +++ b/tests/data/csv/max_file_count/numbers1.csv @@ -0,0 +1,4 @@ +col1,col2,col3,col4,col5 +0,1,2,3,4 +5,6,7,8,9 +10,11,12,13,14 \ No newline at end of file diff --git a/tests/data/csv/max_file_count/numbers2.csv b/tests/data/csv/max_file_count/numbers2.csv new file mode 100644 index 0000000000000..a49bbf89b1d3d --- /dev/null +++ b/tests/data/csv/max_file_count/numbers2.csv @@ -0,0 +1,4 @@ +col1,col2,col3,col4,col5 +0,1,2,3,4 +5,6,7,8,9 +10,11,12,13,14 \ No newline at end of file diff --git a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test index 558617d91adc0..a29842ce01cfe 100644 --- a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test +++ b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test @@ -144,6 +144,15 @@ col3 VARCHAR 1 2 col4 VARCHAR 1 3 col5 VARCHAR 1 4 +query TTBI +select * from infer_schema(location => '@data/csv/max_file_count/', file_format => 'head_csv_format', max_file_count => 2); +---- +col1 BIGINT 1 0 +col2 BIGINT 1 1 +col3 BIGINT 1 2 +col4 BIGINT 1 3 +col5 BIGINT 1 4 + # NDJSON query TTBI select * from infer_schema(location => '@data/ndjson/numbers.ndjson', file_format => 'NDJSON'); From b26101eddaf32baa9e18a7249fff61e4ca041c23 Mon Sep 17 00:00:00 2001 From: kould Date: Tue, 9 Sep 2025 14:03:39 +0800 Subject: [PATCH 18/20] chore: codefmt --- src/query/service/src/table_functions/infer_schema/separator.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/query/service/src/table_functions/infer_schema/separator.rs b/src/query/service/src/table_functions/infer_schema/separator.rs index 35bdbe76f3259..c55b0429af645 100644 --- a/src/query/service/src/table_functions/infer_schema/separator.rs +++ b/src/query/service/src/table_functions/infer_schema/separator.rs @@ -145,7 +145,7 @@ impl 
AccumulatingTransform for InferSchemaSeparator { }; self.schemas = Some(merge_schema); - self.remaining_files_len = self.remaining_files_len.checked_sub(1).unwrap_or(0); + self.remaining_files_len = self.remaining_files_len.saturating_sub(1); if self.remaining_files_len > 0 { return Ok(vec![DataBlock::empty()]); } From fc6ce4be94a80b21cb7dc1a86a4210fc75c3cf71 Mon Sep 17 00:00:00 2001 From: kould Date: Tue, 9 Sep 2025 18:46:17 +0800 Subject: [PATCH 19/20] feat: impl `max_file_count` for `infer_schema` --- src/common/storage/src/stage.rs | 2 +- src/meta/app/src/principal/file_format.rs | 24 +- src/meta/app/src/principal/user_stage.rs | 10 +- src/query/ast/src/ast/statements/copy.rs | 4 +- .../infer_schema/infer_schema_table.rs | 1 + .../table_functions/infer_schema/parquet.rs | 9 + .../table_functions/infer_schema/separator.rs | 42 ++- src/query/storages/stage/src/infer_schema.rs | 9 +- tests/data/csv/max_file_count/numbers0.csv | 3 +- .../ndjson/max_file_count/numbers0.ndjson | 3 + .../ndjson/max_file_count/numbers1.ndjson | 3 + .../ndjson/max_file_count/numbers2.ndjson | 3 + .../parquet/max_file_count/tuple0.parquet | Bin 0 -> 2029 bytes .../parquet/max_file_count/tuple1.parquet | Bin 0 -> 2029 bytes .../parquet/max_file_count/tuple2.parquet | Bin 0 -> 2029 bytes .../stage/formats/parquet/infer_schema.test | 297 ++++++++++-------- 16 files changed, 240 insertions(+), 170 deletions(-) create mode 100644 tests/data/ndjson/max_file_count/numbers0.ndjson create mode 100644 tests/data/ndjson/max_file_count/numbers1.ndjson create mode 100644 tests/data/ndjson/max_file_count/numbers2.ndjson create mode 100644 tests/data/parquet/max_file_count/tuple0.parquet create mode 100644 tests/data/parquet/max_file_count/tuple1.parquet create mode 100644 tests/data/parquet/max_file_count/tuple2.parquet diff --git a/src/common/storage/src/stage.rs b/src/common/storage/src/stage.rs index 6b863ff4e5252..4ce56be4e1f67 100644 --- a/src/common/storage/src/stage.rs +++ b/src/common/storage/src/stage.rs @@ -98,7 +98,7 @@ pub fn init_stage_operator(stage_info: &StageInfo) -> Result { } /// select * from @s1/ (FILES => PATTERN => ) /// copy from @s1/ FILES = PATTERN => -#[derive(serde::Serialize, serde::Deserialize, Clone, PartialEq, Eq, Debug, Hash)] +#[derive(serde::Serialize, serde::Deserialize, Clone, PartialEq, Eq, Debug)] pub struct StageFilesInfo { pub path: String, pub files: Option>, diff --git a/src/meta/app/src/principal/file_format.rs b/src/meta/app/src/principal/file_format.rs index 8fc90ce74c79e..19e829c44e2ee 100644 --- a/src/meta/app/src/principal/file_format.rs +++ b/src/meta/app/src/principal/file_format.rs @@ -52,7 +52,7 @@ const OPT_BINARY_FORMAT: &str = "binary_format"; const OPT_USE_LOGIC_TYPE: &str = "use_logic_type"; /// File format parameters after checking and parsing. 
-#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] #[serde(tag = "type")] pub enum FileFormatParams { Csv(CsvFileFormatParams), @@ -446,7 +446,7 @@ impl FileFormatOptionsReader { } } -#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct CsvFileFormatParams { pub compression: StageFileCompression, @@ -498,7 +498,7 @@ impl CsvFileFormatParams { } } -#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct TsvFileFormatParams { pub compression: StageFileCompression, pub headers: u64, @@ -532,7 +532,7 @@ impl TsvFileFormatParams { } } -#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct XmlFileFormatParams { pub compression: StageFileCompression, pub row_tag: String, @@ -558,7 +558,7 @@ impl Default for XmlFileFormatParams { /// used for both `missing_field_as` and `null_field_as` /// for extensibility, it is stored as PB string in meta -#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize, Default)] +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, Default)] pub enum NullAs { /// for `missing_field_as` only, and is default for it for safety, /// in case of wrong field names when creating table. @@ -570,7 +570,7 @@ pub enum NullAs { FieldDefault, } -#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize, Default)] +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, Default)] pub enum EmptyFieldAs { #[default] Null, @@ -638,7 +638,7 @@ impl Display for NullAs { } } -#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize, Default)] +#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize, Default)] pub enum BinaryFormat { #[default] Hex, @@ -668,7 +668,7 @@ impl Display for BinaryFormat { } } -#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct JsonFileFormatParams { pub compression: StageFileCompression, } @@ -690,7 +690,7 @@ impl Default for JsonFileFormatParams { } } -#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct NdJsonFileFormatParams { pub compression: StageFileCompression, pub missing_field_as: NullAs, @@ -741,7 +741,7 @@ impl NdJsonFileFormatParams { } } -#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct AvroFileFormatParams { pub compression: StageFileCompression, pub missing_field_as: NullAs, @@ -791,7 +791,7 @@ impl AvroFileFormatParams { } } -#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct ParquetFileFormatParams { // used only for unload pub compression: StageFileCompression, @@ -828,7 +828,7 @@ impl ParquetFileFormatParams { } } -#[derive(Clone, Debug, Default, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)] pub struct OrcFileFormatParams { pub missing_field_as: NullAs, } diff --git a/src/meta/app/src/principal/user_stage.rs b/src/meta/app/src/principal/user_stage.rs index c2261b288c6d4..92da76b413c07 100644 --- 
a/src/meta/app/src/principal/user_stage.rs +++ b/src/meta/app/src/principal/user_stage.rs @@ -60,7 +60,7 @@ pub const COPY_MAX_FILES_PER_COMMIT: usize = 15000; /// Instruction for exceeding 'copy into table' file limit. pub const COPY_MAX_FILES_COMMIT_MSG: &str = "Commit limit reached: 15,000 files for 'copy into table'. To handle more files, adjust 'CopyOption' with 'max_files='(e.g., 'max_files=10000') and perform several operations until all files are processed."; -#[derive(serde::Serialize, serde::Deserialize, Clone, Debug, Eq, PartialEq, Hash)] +#[derive(serde::Serialize, serde::Deserialize, Clone, Debug, Eq, PartialEq)] pub enum StageType { /// LegacyInternal will be deprecated. /// @@ -96,7 +96,7 @@ impl Default for StageType { } } -#[derive(serde::Serialize, serde::Deserialize, Clone, Copy, Debug, Eq, PartialEq, Hash)] +#[derive(serde::Serialize, serde::Deserialize, Clone, Copy, Debug, Eq, PartialEq)] pub enum StageFileCompression { Auto, Gzip, @@ -396,13 +396,13 @@ impl Display for FileFormatOptions { } } -#[derive(serde::Serialize, serde::Deserialize, Default, Clone, Debug, Eq, PartialEq, Hash)] +#[derive(serde::Serialize, serde::Deserialize, Default, Clone, Debug, Eq, PartialEq)] #[serde(default)] pub struct StageParams { pub storage: StorageParams, } -#[derive(serde::Serialize, serde::Deserialize, Clone, Default, Debug, Eq, PartialEq, Hash)] +#[derive(serde::Serialize, serde::Deserialize, Clone, Default, Debug, Eq, PartialEq)] #[serde(default)] pub struct CopyOptions { pub on_error: OnErrorMode, @@ -419,7 +419,7 @@ pub struct CopyOptions { pub detailed_output: bool, } -#[derive(serde::Serialize, serde::Deserialize, Default, Clone, Debug, Eq, PartialEq, Hash)] +#[derive(serde::Serialize, serde::Deserialize, Default, Clone, Debug, Eq, PartialEq)] #[serde(default)] pub struct StageInfo { pub stage_name: String, diff --git a/src/query/ast/src/ast/statements/copy.rs b/src/query/ast/src/ast/statements/copy.rs index b207b05d7d879..8e10e37318270 100644 --- a/src/query/ast/src/ast/statements/copy.rs +++ b/src/query/ast/src/ast/statements/copy.rs @@ -648,9 +648,7 @@ impl Display for FileFormatValue { } } -#[derive( - serde::Serialize, serde::Deserialize, Debug, Clone, PartialEq, Drive, DriveMut, Eq, Hash, -)] +#[derive(serde::Serialize, serde::Deserialize, Debug, Clone, PartialEq, Drive, DriveMut, Eq)] pub enum OnErrorMode { Continue, SkipFileNum(u64), diff --git a/src/query/service/src/table_functions/infer_schema/infer_schema_table.rs b/src/query/service/src/table_functions/infer_schema/infer_schema_table.rs index 5e1ad8ac61b55..f946aead234e0 100644 --- a/src/query/service/src/table_functions/infer_schema/infer_schema_table.rs +++ b/src/query/service/src/table_functions/infer_schema/infer_schema_table.rs @@ -102,6 +102,7 @@ impl InferSchemaTable { TableField::new("column_name", TableDataType::String), TableField::new("type", TableDataType::String), TableField::new("nullable", TableDataType::Boolean), + TableField::new("filenames", TableDataType::String), TableField::new("order_id", TableDataType::Number(NumberDataType::UInt64)), ]) } diff --git a/src/query/service/src/table_functions/infer_schema/parquet.rs b/src/query/service/src/table_functions/infer_schema/parquet.rs index 38e15819d0eb7..5db9713b9ac69 100644 --- a/src/query/service/src/table_functions/infer_schema/parquet.rs +++ b/src/query/service/src/table_functions/infer_schema/parquet.rs @@ -32,6 +32,7 @@ use databend_common_storage::init_stage_operator; use databend_common_storage::read_parquet_schema_async_rs; use 
databend_common_storage::StageFileInfo; use futures_util::future::try_join_all; +use itertools::Itertools; use crate::table_functions::infer_schema::infer_schema_table::INFER_SCHEMA; @@ -79,6 +80,12 @@ impl AsyncSource for ParquetInferSchemaSource { let mut names: Vec = vec![]; let mut types: Vec = vec![]; let mut nulls: Vec = vec![]; + let mut filenames: Vec = vec![]; + let filenames_str = self + .stage_file_infos + .iter() + .map(|info| &info.path) + .join(", "); for field in table_schema.fields().iter() { names.push(field.name().to_string()); @@ -86,6 +93,7 @@ impl AsyncSource for ParquetInferSchemaSource { let non_null_type = field.data_type().remove_recursive_nullable(); types.push(non_null_type.sql_name()); nulls.push(field.is_nullable()); + filenames.push(filenames_str.clone()); } let order_ids = (0..table_schema.fields().len() as u64).collect::>(); @@ -94,6 +102,7 @@ impl AsyncSource for ParquetInferSchemaSource { StringType::from_data(names), StringType::from_data(types), BooleanType::from_data(nulls), + StringType::from_data(filenames), UInt64Type::from_data(order_ids), ]); Ok(Some(block)) diff --git a/src/query/service/src/table_functions/infer_schema/separator.rs b/src/query/service/src/table_functions/infer_schema/separator.rs index c55b0429af645..b5607f0a3b9f1 100644 --- a/src/query/service/src/table_functions/infer_schema/separator.rs +++ b/src/query/service/src/table_functions/infer_schema/separator.rs @@ -32,15 +32,19 @@ use databend_common_expression::TableSchema; use databend_common_meta_app::principal::FileFormatParams; use databend_common_pipeline_transforms::AccumulatingTransform; use databend_common_storages_stage::BytesBatch; +use itertools::Itertools; use crate::table_functions::infer_schema::merge::merge_schema; +const MAX_SINGLE_FILE_BYTES: usize = 100 * 1024 * 1024; + pub struct InferSchemaSeparator { pub file_format_params: FileFormatParams, files: HashMap>, pub max_records: Option, schemas: Option, - remaining_files_len: usize, + files_len: usize, + filenames: Vec, is_finished: bool, } @@ -55,7 +59,8 @@ impl InferSchemaSeparator { files: HashMap::new(), max_records, schemas: None, - remaining_files_len: files_len, + files_len, + filenames: Vec::with_capacity(files_len), is_finished: false, } } @@ -76,6 +81,14 @@ impl AccumulatingTransform for InferSchemaSeparator { let bytes = self.files.entry(batch.path.clone()).or_default(); bytes.extend(batch.data); + if bytes.len() > MAX_SINGLE_FILE_BYTES { + return Err(ErrorCode::InvalidArgument(format!( + "The file '{}' is too large(maximum allowed: {})", + batch.path, + human_readable_size(MAX_SINGLE_FILE_BYTES), + ))); + } + // When max_records exists, it will try to use the current bytes to read, otherwise it will buffer all bytes if self.max_records.is_none() && !batch.is_eof { return Ok(vec![DataBlock::empty()]); @@ -138,6 +151,7 @@ impl AccumulatingTransform for InferSchemaSeparator { } }; self.files.remove(&batch.path); + self.filenames.push(batch.path); let merge_schema = match self.schemas.take() { None => TableSchema::try_from(&arrow_schema)?, @@ -145,8 +159,7 @@ impl AccumulatingTransform for InferSchemaSeparator { }; self.schemas = Some(merge_schema); - self.remaining_files_len = self.remaining_files_len.saturating_sub(1); - if self.remaining_files_len > 0 { + if self.files_len > self.filenames.len() { return Ok(vec![DataBlock::empty()]); } self.is_finished = true; @@ -157,6 +170,8 @@ impl AccumulatingTransform for InferSchemaSeparator { let mut names: Vec = vec![]; let mut types: Vec = vec![]; let mut nulls: 
Vec = vec![]; + let mut filenames: Vec = vec![]; + let filenames_str = self.filenames.iter().join(", "); for field in table_schema.fields().iter() { names.push(field.name().to_string()); @@ -164,6 +179,7 @@ impl AccumulatingTransform for InferSchemaSeparator { let non_null_type = field.data_type().remove_recursive_nullable(); types.push(non_null_type.sql_name()); nulls.push(field.is_nullable()); + filenames.push(filenames_str.clone()); } let order_ids = (0..table_schema.fields().len() as u64).collect::>(); @@ -172,8 +188,26 @@ impl AccumulatingTransform for InferSchemaSeparator { StringType::from_data(names), StringType::from_data(types), BooleanType::from_data(nulls), + StringType::from_data(filenames), UInt64Type::from_data(order_ids), ]); Ok(vec![block]) } } + +fn human_readable_size(bytes: usize) -> String { + const KB: f64 = 1024.0; + const MB: f64 = KB * 1024.0; + const GB: f64 = MB * 1024.0; + + let b = bytes as f64; + if b >= GB { + format!("{:.2} GB", b / GB) + } else if b >= MB { + format!("{:.2} MB", b / MB) + } else if b >= KB { + format!("{:.2} KB", b / KB) + } else { + format!("{} B", bytes) + } +} diff --git a/src/query/storages/stage/src/infer_schema.rs b/src/query/storages/stage/src/infer_schema.rs index 5a3c25a6c4910..77a961594992e 100644 --- a/src/query/storages/stage/src/infer_schema.rs +++ b/src/query/storages/stage/src/infer_schema.rs @@ -13,9 +13,6 @@ // limitations under the License. use std::any::Any; -use std::hash::DefaultHasher; -use std::hash::Hash; -use std::hash::Hasher; use std::sync::Arc; use databend_common_catalog::plan::PartInfo; @@ -48,11 +45,7 @@ impl PartInfo for InferSchemaPartInfo { } fn hash(&self) -> u64 { - let mut s = DefaultHasher::new(); - self.files_info.hash(&mut s); - self.file_format_params.hash(&mut s); - self.stage_info.hash(&mut s); - s.finish() + 0 } fn part_type(&self) -> PartInfoType { diff --git a/tests/data/csv/max_file_count/numbers0.csv b/tests/data/csv/max_file_count/numbers0.csv index d0abce6450294..a49bbf89b1d3d 100644 --- a/tests/data/csv/max_file_count/numbers0.csv +++ b/tests/data/csv/max_file_count/numbers0.csv @@ -1,5 +1,4 @@ col1,col2,col3,col4,col5 0,1,2,3,4 5,6,7,8,9 -10,11,12,13,14 -a,b,c,d,e \ No newline at end of file +10,11,12,13,14 \ No newline at end of file diff --git a/tests/data/ndjson/max_file_count/numbers0.ndjson b/tests/data/ndjson/max_file_count/numbers0.ndjson new file mode 100644 index 0000000000000..aecddc3762d07 --- /dev/null +++ b/tests/data/ndjson/max_file_count/numbers0.ndjson @@ -0,0 +1,3 @@ +{"id": 1, "value": 100} +{"id": 2, "value": 200} +{"id": 3, "value": 300} diff --git a/tests/data/ndjson/max_file_count/numbers1.ndjson b/tests/data/ndjson/max_file_count/numbers1.ndjson new file mode 100644 index 0000000000000..aecddc3762d07 --- /dev/null +++ b/tests/data/ndjson/max_file_count/numbers1.ndjson @@ -0,0 +1,3 @@ +{"id": 1, "value": 100} +{"id": 2, "value": 200} +{"id": 3, "value": 300} diff --git a/tests/data/ndjson/max_file_count/numbers2.ndjson b/tests/data/ndjson/max_file_count/numbers2.ndjson new file mode 100644 index 0000000000000..aecddc3762d07 --- /dev/null +++ b/tests/data/ndjson/max_file_count/numbers2.ndjson @@ -0,0 +1,3 @@ +{"id": 1, "value": 100} +{"id": 2, "value": 200} +{"id": 3, "value": 300} diff --git a/tests/data/parquet/max_file_count/tuple0.parquet b/tests/data/parquet/max_file_count/tuple0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..53ccb995f5badcb5bb0be445de959d8102bf44e4 GIT binary patch literal 2029 
zcmb_d-EQJW6dnU>p$e%{wMIrl6|%Z07i82Deq@zkrEtTrdF= zt<(qTWz~yb^s<+|=mYd2D)lX@Y9FEJj3FWEk6LxDcs%~TGw1u}%rQBpoq`Y(L?PW4 zc!9h}NFv7(LLzWS;TDJx7ZN$(w&3Q|Vhm9+hA~O9z3~P>ED?!*K(aq@X(5s-6Mict zMr>1vB>CjGT=IJ^`$;zbPZ?4-+(sCz#nULevc(JU{7YNWI2Yx! zzeI0cZF2!fnXRw6B0n{Tma7RdsMr&Nl_r34!5ZKw5fVUf z@)OcdgL0BjrBg57=JCAcqGta`QIg+9lYBOw{3-gFkW^0Oea$nqsrYN;MzTHATAUkW zXXbiS>3Qa}FLD_vd_8BJ^#EnBHe5lgW6F%2(E8YL7ic3#Fa>PEi7P(n@@TEbbI`O44g0CTOSgQh6RWq4*pB zAzr0Hl~Q^FH(sAoTB7oR)+rs(>VRRge2Edj@9U1 zH%H@vQpiu6)z(NaE(*F*4zb;-t-JL=J#Fp*=d0Bw#<-f&ovQW9tgjRtTnl=aO&fB1 z@(^+kjY1`F_2e}-GxradjCacgy(gD@yPdMpI0H}A)|7JoXmnz?tJKz&ZoaQ{hGwIN zerKk#m|lQqYuUQf^1@ivbsW9Xx!CZR|9cM=TXVZ3GxQI63v;Wx>G2*PsJ+fmZPfF9 z+;;;!Hx`vu?Ut%i&+EHQ^apib_;8c`w#Ght5Bm?t1041ZY|Y6r42(WwJX?R;st|@~ZQWWYq2AG}wO*AqrOcLw{R!b= z7h+3Oy130>V@(P{XRCfa!2X46l@zCRm2bm8gHd1hRScjM8hU_RWTqH>%Eph0nDMpA jmoqDPYWV(B_7P`RA8981d{ZyrCqMin1)h*&_(%T-pz+dT literal 0 HcmV?d00001 diff --git a/tests/data/parquet/max_file_count/tuple1.parquet b/tests/data/parquet/max_file_count/tuple1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..53ccb995f5badcb5bb0be445de959d8102bf44e4 GIT binary patch literal 2029 zcmb_d-EQJW6dnU>p$e%{wMIrl6|%Z07i82Deq@zkrEtTrdF= zt<(qTWz~yb^s<+|=mYd2D)lX@Y9FEJj3FWEk6LxDcs%~TGw1u}%rQBpoq`Y(L?PW4 zc!9h}NFv7(LLzWS;TDJx7ZN$(w&3Q|Vhm9+hA~O9z3~P>ED?!*K(aq@X(5s-6Mict zMr>1vB>CjGT=IJ^`$;zbPZ?4-+(sCz#nULevc(JU{7YNWI2Yx! zzeI0cZF2!fnXRw6B0n{Tma7RdsMr&Nl_r34!5ZKw5fVUf z@)OcdgL0BjrBg57=JCAcqGta`QIg+9lYBOw{3-gFkW^0Oea$nqsrYN;MzTHATAUkW zXXbiS>3Qa}FLD_vd_8BJ^#EnBHe5lgW6F%2(E8YL7ic3#Fa>PEi7P(n@@TEbbI`O44g0CTOSgQh6RWq4*pB zAzr0Hl~Q^FH(sAoTB7oR)+rs(>VRRge2Edj@9U1 zH%H@vQpiu6)z(NaE(*F*4zb;-t-JL=J#Fp*=d0Bw#<-f&ovQW9tgjRtTnl=aO&fB1 z@(^+kjY1`F_2e}-GxradjCacgy(gD@yPdMpI0H}A)|7JoXmnz?tJKz&ZoaQ{hGwIN zerKk#m|lQqYuUQf^1@ivbsW9Xx!CZR|9cM=TXVZ3GxQI63v;Wx>G2*PsJ+fmZPfF9 z+;;;!Hx`vu?Ut%i&+EHQ^apib_;8c`w#Ght5Bm?t1041ZY|Y6r42(WwJX?R;st|@~ZQWWYq2AG}wO*AqrOcLw{R!b= z7h+3Oy130>V@(P{XRCfa!2X46l@zCRm2bm8gHd1hRScjM8hU_RWTqH>%Eph0nDMpA jmoqDPYWV(B_7P`RA8981d{ZyrCqMin1)h*&_(%T-pz+dT literal 0 HcmV?d00001 diff --git a/tests/data/parquet/max_file_count/tuple2.parquet b/tests/data/parquet/max_file_count/tuple2.parquet new file mode 100644 index 0000000000000000000000000000000000000000..53ccb995f5badcb5bb0be445de959d8102bf44e4 GIT binary patch literal 2029 zcmb_d-EQJW6dnU>p$e%{wMIrl6|%Z07i82Deq@zkrEtTrdF= zt<(qTWz~yb^s<+|=mYd2D)lX@Y9FEJj3FWEk6LxDcs%~TGw1u}%rQBpoq`Y(L?PW4 zc!9h}NFv7(LLzWS;TDJx7ZN$(w&3Q|Vhm9+hA~O9z3~P>ED?!*K(aq@X(5s-6Mict zMr>1vB>CjGT=IJ^`$;zbPZ?4-+(sCz#nULevc(JU{7YNWI2Yx! 
zzeI0cZF2!fnXRw6B0n{Tma7RdsMr&Nl_r34!5ZKw5fVUf z@)OcdgL0BjrBg57=JCAcqGta`QIg+9lYBOw{3-gFkW^0Oea$nqsrYN;MzTHATAUkW zXXbiS>3Qa}FLD_vd_8BJ^#EnBHe5lgW6F%2(E8YL7ic3#Fa>PEi7P(n@@TEbbI`O44g0CTOSgQh6RWq4*pB zAzr0Hl~Q^FH(sAoTB7oR)+rs(>VRRge2Edj@9U1 zH%H@vQpiu6)z(NaE(*F*4zb;-t-JL=J#Fp*=d0Bw#<-f&ovQW9tgjRtTnl=aO&fB1 z@(^+kjY1`F_2e}-GxradjCacgy(gD@yPdMpI0H}A)|7JoXmnz?tJKz&ZoaQ{hGwIN zerKk#m|lQqYuUQf^1@ivbsW9Xx!CZR|9cM=TXVZ3GxQI63v;Wx>G2*PsJ+fmZPfF9 z+;;;!Hx`vu?Ut%i&+EHQ^apib_;8c`w#Ght5Bm?t1041ZY|Y6r42(WwJX?R;st|@~ZQWWYq2AG}wO*AqrOcLw{R!b= z7h+3Oy130>V@(P{XRCfa!2X46l@zCRm2bm8gHd1hRScjM8hU_RWTqH>%Eph0nDMpA jmoqDPYWV(B_7P`RA8981d{ZyrCqMin1)h*&_(%T-pz+dT literal 0 HcmV?d00001 diff --git a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test index a29842ce01cfe..cf67ad1ea49ca 100644 --- a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test +++ b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test @@ -5,55 +5,55 @@ select * from infer_schema(location => '@data/invalid_xxx/tuple.parquet') query select * from infer_schema(location => '@data/parquet/tuple.parquet') ---- -id INT 0 0 -t TUPLE(A INT32, B STRING) 0 1 +id INT 0 parquet/tuple.parquet 0 +t TUPLE(A INT32, B STRING) 0 parquet/tuple.parquet 1 query select * from infer_schema(location => '@data/parquet/complex.parquet') ---- -resourceType VARCHAR 1 0 -id VARCHAR 1 1 -meta TUPLE(ID STRING, EXTENSION ARRAY(STRING), VERSIONID STRING, LASTUPDATED TIMESTAMP, SOURCE STRING, PROFILE ARRAY(STRING), SECURITY ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VERSION STRING, CODE STRING, DISPLAY STRING, USERSELECTED BOOLEAN)), TAG ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VERSION STRING, CODE STRING, DISPLAY STRING, USERSELECTED BOOLEAN))) 1 2 -implicitRules VARCHAR 1 3 -language VARCHAR 1 4 -text TUPLE(ID STRING, EXTENSION ARRAY(STRING), STATUS STRING, DIV STRING) 1 5 -contained ARRAY(STRING) 1 6 -extension ARRAY(STRING) 1 7 -modifierExtension ARRAY(STRING) 1 8 -identifier ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), USE STRING, TYPE TUPLE(ID STRING, EXTENSION ARRAY(STRING), CODING ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VERSION STRING, CODE STRING, DISPLAY STRING, USERSELECTED BOOLEAN)), TEXT STRING), SYSTEM STRING, VALUE STRING, PERIOD TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP), ASSIGNER TUPLE(ID STRING, EXTENSION ARRAY(STRING), REFERENCE STRING, TYPE STRING, IDENTIFIER TUPLE(ID STRING, EXTENSION ARRAY(STRING), USE STRING, TYPE TUPLE(ID STRING, EXTENSION ARRAY(STRING), CODING ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VERSION STRING, CODE STRING, DISPLAY STRING, USERSELECTED BOOLEAN)), TEXT STRING), SYSTEM STRING, VALUE STRING, PERIOD TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP), ASSIGNER STRING), DISPLAY STRING))) 1 9 -active BOOLEAN 1 10 -name ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), USE STRING, TEXT STRING, FAMILY STRING, GIVEN ARRAY(STRING), PREFIX ARRAY(STRING), SUFFIX ARRAY(STRING), PERIOD TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP))) 1 11 -telecom ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VALUE STRING, USE STRING, RANK INT32, PERIOD TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP))) 1 12 -gender VARCHAR 1 13 -birthDate DATE 1 14 -deceasedBoolean BOOLEAN 1 15 -deceasedDateTime TIMESTAMP 1 16 -address ARRAY(TUPLE(ID STRING, 
EXTENSION ARRAY(STRING), USE STRING, TYPE STRING, TEXT STRING, LINE ARRAY(STRING), CITY STRING, DISTRICT STRING, STATE STRING, POSTALCODE STRING, COUNTRY STRING, PERIOD TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP))) 1 17 -maritalStatus TUPLE(ID STRING, EXTENSION ARRAY(STRING), CODING ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VERSION STRING, CODE STRING, DISPLAY STRING, USERSELECTED BOOLEAN)), TEXT STRING) 1 18 -multipleBirthBoolean BOOLEAN 1 19 -multipleBirthInteger INT 1 20 -photo ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), CONTENTTYPE STRING, LANGUAGE STRING, DATA BINARY, URL STRING, SIZE INT32, HASH BINARY, TITLE STRING, CREATION TIMESTAMP)) 1 21 -contact ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), MODIFIEREXTENSION ARRAY(STRING), RELATIONSHIP ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), CODING ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VERSION STRING, CODE STRING, DISPLAY STRING, USERSELECTED BOOLEAN)), TEXT STRING)), NAME TUPLE(ID STRING, EXTENSION ARRAY(STRING), USE STRING, TEXT STRING, FAMILY STRING, GIVEN ARRAY(STRING), PREFIX ARRAY(STRING), SUFFIX ARRAY(STRING), PERIOD TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP)), TELECOM ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VALUE STRING, USE STRING, RANK INT32, PERIOD TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP))), ADDRESS TUPLE(ID STRING, EXTENSION ARRAY(STRING), USE STRING, TYPE STRING, TEXT STRING, LINE ARRAY(STRING), CITY STRING, DISTRICT STRING, STATE STRING, POSTALCODE STRING, COUNTRY STRING, PERIOD TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP)), GENDER STRING, ORGANIZATION TUPLE(ID STRING, EXTENSION ARRAY(STRING), REFERENCE STRING, TYPE STRING, IDENTIFIER TUPLE(ID STRING, EXTENSION ARRAY(STRING), USE STRING, TYPE TUPLE(ID STRING, EXTENSION ARRAY(STRING), CODING ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VERSION STRING, CODE STRING, DISPLAY STRING, USERSELECTED BOOLEAN)), TEXT STRING), SYSTEM STRING, VALUE STRING, PERIOD TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP), ASSIGNER STRING), DISPLAY STRING), PERIOD TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP))) 1 22 -communication ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), MODIFIEREXTENSION ARRAY(STRING), LANGUAGE TUPLE(ID STRING, EXTENSION ARRAY(STRING), CODING ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VERSION STRING, CODE STRING, DISPLAY STRING, USERSELECTED BOOLEAN)), TEXT STRING), PREFERRED BOOLEAN)) 1 23 -generalPractitioner ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), REFERENCE STRING, TYPE STRING, IDENTIFIER TUPLE(ID STRING, EXTENSION ARRAY(STRING), USE STRING, TYPE TUPLE(ID STRING, EXTENSION ARRAY(STRING), CODING ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VERSION STRING, CODE STRING, DISPLAY STRING, USERSELECTED BOOLEAN)), TEXT STRING), SYSTEM STRING, VALUE STRING, PERIOD TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP), ASSIGNER STRING), DISPLAY STRING)) 1 24 -managingOrganization TUPLE(ID STRING, EXTENSION ARRAY(STRING), REFERENCE STRING, TYPE STRING, IDENTIFIER TUPLE(ID STRING, EXTENSION ARRAY(STRING), USE STRING, TYPE TUPLE(ID STRING, EXTENSION ARRAY(STRING), CODING ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VERSION STRING, CODE STRING, DISPLAY STRING, USERSELECTED BOOLEAN)), TEXT STRING), SYSTEM STRING, VALUE STRING, PERIOD 
TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP), ASSIGNER STRING), DISPLAY STRING) 1 25
-link ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), MODIFIEREXTENSION ARRAY(STRING), OTHER TUPLE(ID STRING, EXTENSION ARRAY(STRING), REFERENCE STRING, TYPE STRING, IDENTIFIER TUPLE(ID STRING, EXTENSION ARRAY(STRING), USE STRING, TYPE TUPLE(ID STRING, EXTENSION ARRAY(STRING), CODING ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VERSION STRING, CODE STRING, DISPLAY STRING, USERSELECTED BOOLEAN)), TEXT STRING), SYSTEM STRING, VALUE STRING, PERIOD TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP), ASSIGNER STRING), DISPLAY STRING), TYPE STRING)) 1 26
-yy__version INT 1 27
-yy__us_core_race VARCHAR 1 28
-yy__us_core_ethnicity VARCHAR 1 29
-yy__us_core_birthsex TUPLE(VALUECODE STRING,) 1 30
+resourceType VARCHAR 1 parquet/complex.parquet 0
+id VARCHAR 1 parquet/complex.parquet 1
+meta TUPLE(ID STRING, EXTENSION ARRAY(STRING), VERSIONID STRING, LASTUPDATED TIMESTAMP, SOURCE STRING, PROFILE ARRAY(STRING), SECURITY ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VERSION STRING, CODE STRING, DISPLAY STRING, USERSELECTED BOOLEAN)), TAG ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VERSION STRING, CODE STRING, DISPLAY STRING, USERSELECTED BOOLEAN))) 1 parquet/complex.parquet 2
+implicitRules VARCHAR 1 parquet/complex.parquet 3
+language VARCHAR 1 parquet/complex.parquet 4
+text TUPLE(ID STRING, EXTENSION ARRAY(STRING), STATUS STRING, DIV STRING) 1 parquet/complex.parquet 5
+contained ARRAY(STRING) 1 parquet/complex.parquet 6
+extension ARRAY(STRING) 1 parquet/complex.parquet 7
+modifierExtension ARRAY(STRING) 1 parquet/complex.parquet 8
+identifier ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), USE STRING, TYPE TUPLE(ID STRING, EXTENSION ARRAY(STRING), CODING ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VERSION STRING, CODE STRING, DISPLAY STRING, USERSELECTED BOOLEAN)), TEXT STRING), SYSTEM STRING, VALUE STRING, PERIOD TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP), ASSIGNER TUPLE(ID STRING, EXTENSION ARRAY(STRING), REFERENCE STRING, TYPE STRING, IDENTIFIER TUPLE(ID STRING, EXTENSION ARRAY(STRING), USE STRING, TYPE TUPLE(ID STRING, EXTENSION ARRAY(STRING), CODING ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VERSION STRING, CODE STRING, DISPLAY STRING, USERSELECTED BOOLEAN)), TEXT STRING), SYSTEM STRING, VALUE STRING, PERIOD TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP), ASSIGNER STRING), DISPLAY STRING))) 1 parquet/complex.parquet 9
+active BOOLEAN 1 parquet/complex.parquet 10
+name ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), USE STRING, TEXT STRING, FAMILY STRING, GIVEN ARRAY(STRING), PREFIX ARRAY(STRING), SUFFIX ARRAY(STRING), PERIOD TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP))) 1 parquet/complex.parquet 11
+telecom ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VALUE STRING, USE STRING, RANK INT32, PERIOD TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP))) 1 parquet/complex.parquet 12
+gender VARCHAR 1 parquet/complex.parquet 13
+birthDate DATE 1 parquet/complex.parquet 14
+deceasedBoolean BOOLEAN 1 parquet/complex.parquet 15
+deceasedDateTime TIMESTAMP 1 parquet/complex.parquet 16
+address ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), USE STRING, TYPE STRING, TEXT STRING, LINE ARRAY(STRING), CITY STRING, DISTRICT STRING, STATE STRING, POSTALCODE STRING, COUNTRY STRING, PERIOD TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP))) 1 parquet/complex.parquet 17
+maritalStatus TUPLE(ID STRING, EXTENSION ARRAY(STRING), CODING ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VERSION STRING, CODE STRING, DISPLAY STRING, USERSELECTED BOOLEAN)), TEXT STRING) 1 parquet/complex.parquet 18
+multipleBirthBoolean BOOLEAN 1 parquet/complex.parquet 19
+multipleBirthInteger INT 1 parquet/complex.parquet 20
+photo ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), CONTENTTYPE STRING, LANGUAGE STRING, DATA BINARY, URL STRING, SIZE INT32, HASH BINARY, TITLE STRING, CREATION TIMESTAMP)) 1 parquet/complex.parquet 21
+contact ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), MODIFIEREXTENSION ARRAY(STRING), RELATIONSHIP ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), CODING ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VERSION STRING, CODE STRING, DISPLAY STRING, USERSELECTED BOOLEAN)), TEXT STRING)), NAME TUPLE(ID STRING, EXTENSION ARRAY(STRING), USE STRING, TEXT STRING, FAMILY STRING, GIVEN ARRAY(STRING), PREFIX ARRAY(STRING), SUFFIX ARRAY(STRING), PERIOD TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP)), TELECOM ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VALUE STRING, USE STRING, RANK INT32, PERIOD TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP))), ADDRESS TUPLE(ID STRING, EXTENSION ARRAY(STRING), USE STRING, TYPE STRING, TEXT STRING, LINE ARRAY(STRING), CITY STRING, DISTRICT STRING, STATE STRING, POSTALCODE STRING, COUNTRY STRING, PERIOD TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP)), GENDER STRING, ORGANIZATION TUPLE(ID STRING, EXTENSION ARRAY(STRING), REFERENCE STRING, TYPE STRING, IDENTIFIER TUPLE(ID STRING, EXTENSION ARRAY(STRING), USE STRING, TYPE TUPLE(ID STRING, EXTENSION ARRAY(STRING), CODING ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VERSION STRING, CODE STRING, DISPLAY STRING, USERSELECTED BOOLEAN)), TEXT STRING), SYSTEM STRING, VALUE STRING, PERIOD TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP), ASSIGNER STRING), DISPLAY STRING), PERIOD TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP))) 1 parquet/complex.parquet 22
+communication ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), MODIFIEREXTENSION ARRAY(STRING), LANGUAGE TUPLE(ID STRING, EXTENSION ARRAY(STRING), CODING ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VERSION STRING, CODE STRING, DISPLAY STRING, USERSELECTED BOOLEAN)), TEXT STRING), PREFERRED BOOLEAN)) 1 parquet/complex.parquet 23
+generalPractitioner ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), REFERENCE STRING, TYPE STRING, IDENTIFIER TUPLE(ID STRING, EXTENSION ARRAY(STRING), USE STRING, TYPE TUPLE(ID STRING, EXTENSION ARRAY(STRING), CODING ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VERSION STRING, CODE STRING, DISPLAY STRING, USERSELECTED BOOLEAN)), TEXT STRING), SYSTEM STRING, VALUE STRING, PERIOD TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP), ASSIGNER STRING), DISPLAY STRING)) 1 parquet/complex.parquet 24
+managingOrganization TUPLE(ID STRING, EXTENSION ARRAY(STRING), REFERENCE STRING, TYPE STRING, IDENTIFIER TUPLE(ID STRING, EXTENSION ARRAY(STRING), USE STRING, TYPE TUPLE(ID STRING, EXTENSION ARRAY(STRING), CODING ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VERSION STRING, CODE STRING, DISPLAY STRING, USERSELECTED BOOLEAN)), TEXT STRING), SYSTEM STRING, VALUE STRING, PERIOD TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP), ASSIGNER STRING), DISPLAY STRING) 1 parquet/complex.parquet 25
+link ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), MODIFIEREXTENSION ARRAY(STRING), OTHER TUPLE(ID STRING, EXTENSION ARRAY(STRING), REFERENCE STRING, TYPE STRING, IDENTIFIER TUPLE(ID STRING, EXTENSION ARRAY(STRING), USE STRING, TYPE TUPLE(ID STRING, EXTENSION ARRAY(STRING), CODING ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VERSION STRING, CODE STRING, DISPLAY STRING, USERSELECTED BOOLEAN)), TEXT STRING), SYSTEM STRING, VALUE STRING, PERIOD TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP), ASSIGNER STRING), DISPLAY STRING), TYPE STRING)) 1 parquet/complex.parquet 26
+yy__version INT 1 parquet/complex.parquet 27
+yy__us_core_race VARCHAR 1 parquet/complex.parquet 28
+yy__us_core_ethnicity VARCHAR 1 parquet/complex.parquet 29
+yy__us_core_birthsex TUPLE(VALUECODE STRING,) 1 parquet/complex.parquet 30
query
select * from infer_schema(location => '@data/parquet/variant.parquet')
----
-a INT 0 0
-b VARIANT 0 1
+a INT 0 parquet/variant.parquet 0
+b VARIANT 0 parquet/variant.parquet 1
query
select * from infer_schema(location => '@data/parquet/', FILE_FORMAT => 'PARQUET', pattern => 'tuple.*')
----
-id INT 0 0
-t TUPLE(A INT32, B STRING) 0 1
+id INT 0 parquet/tuple.parquet 0
+t TUPLE(A INT32, B STRING) 0 parquet/tuple.parquet 1
statement ok
drop CONNECTION IF EXISTS my_conn
@@ -61,168 +61,195 @@ drop CONNECTION IF EXISTS my_conn
statement ok
create CONNECTION my_conn STORAGE_TYPE = 's3' access_key_id='minioadmin' secret_access_key='minioadmin' endpoint_url='http://127.0.0.1:9900/' region='auto'
-query
-select * from INFER_SCHEMA(location => 's3://testbucket/data/parquet/tuple.parquet', connection_name => 'my_conn')
+# query
+# select * from INFER_SCHEMA(location => 's3://testbucket/data/parquet/tuple.parquet', connection_name => 'my_conn')
+# ----
+# id INT 0 parquet/tuple.parquet 0
+# t TUPLE(A INT32, B STRING) 0 parquet/tuple.parquet 1
+
+query T
+select CASE
+ WHEN filenames LIKE '%,%'
+ THEN 'Y'
+ ELSE 'N'
+ END AS format_check
+from infer_schema(location => '@data/parquet/max_file_count', max_file_count => 2)
----
-id INT 0 0
-t TUPLE(A INT32, B STRING) 0 1
+Y
+Y
# CSV
statement ok
create or replace file format head_csv_format type = 'CSV' field_delimiter = ',' skip_header = 1;
-query TTBI
+query TTBTI
select * from infer_schema(location => '@data/csv/numbers_with_headers.csv', file_format => 'CSV');
----
-column_1 VARCHAR 1 0
-column_2 VARCHAR 1 1
+column_1 VARCHAR 1 csv/numbers_with_headers.csv 0
+column_2 VARCHAR 1 csv/numbers_with_headers.csv 1
-query TTBI
+query TTBTI
select * from infer_schema(location => '@data/csv/numbers_with_headers.csv', file_format => 'head_csv_format');
----
-id BIGINT 1 0
-value BIGINT 1 1
+id BIGINT 1 csv/numbers_with_headers.csv 0
+value BIGINT 1 csv/numbers_with_headers.csv 1
statement error
select * from infer_schema(location => '@data/csv/ragged.csv', file_format => 'head_csv_format');
-query TTBI
+query TTBTI
select * from infer_schema(location => '@data/csv/max_records.csv', file_format => 'head_csv_format');
----
-id BIGINT 1 0
-value VARCHAR 1 1
+id BIGINT 1 csv/max_records.csv 0
+value VARCHAR 1 csv/max_records.csv 1
-query TTBI
+query TTBTI
select * from infer_schema(location => '@data/csv/max_records.csv', file_format => 'head_csv_format', max_records_pre_file => 5);
----
-id BIGINT 1 0
-value BIGINT 1 1
+id BIGINT 1 csv/max_records.csv 0
+value BIGINT 1 csv/max_records.csv 1
-query TTBI
+query TTBTI
select * from infer_schema(location => '@data/csv/max_records.zip', file_format => 'head_csv_format', max_records_pre_file => 5);
----
-id BIGINT 1 0
-value BIGINT 1 1
+id BIGINT 1 csv/max_records.zip 0
+value BIGINT 1 csv/max_records.zip 1
-query TTBI
+query TTBTI
select * from infer_schema(location => '@data/csv/max_records.zst', file_format => 'head_csv_format', max_records_pre_file => 5);
----
-id BIGINT 1 0
-value BIGINT 1 1
+id BIGINT 1 csv/max_records.zst 0
+value BIGINT 1 csv/max_records.zst 1
-query TTBI
+query TTBTI
select * from infer_schema(location => '@data/csv/max_records.csv', file_format => 'head_csv_format', max_records_pre_file => 5);
----
-id BIGINT 1 0
-value BIGINT 1 1
+id BIGINT 1 csv/max_records.csv 0
+value BIGINT 1 csv/max_records.csv 1
-query TTBI
+query TTBTI
select * from infer_schema(location => '@data/csv/max_records.xz', file_format => 'head_csv_format', max_records_pre_file => 5);
----
-id BIGINT 1 0
-value BIGINT 1 1
+id BIGINT 1 csv/max_records.xz 0
+value BIGINT 1 csv/max_records.xz 1
-query TTBI
+query TTBTI
select * from infer_schema(location => '@data/csv/types.csv', file_format => 'head_csv_format')
----
-bool_col BOOLEAN 1 0
-int_col BIGINT 1 1
-float_col DOUBLE 1 2
-date_col DATE 1 3
-ts_sec TIMESTAMP 1 4
-ts_ms TIMESTAMP 1 5
-ts_us TIMESTAMP 1 6
-ts_ns TIMESTAMP 1 7
-utf8_col VARCHAR 1 8
-
-query TTBI
+bool_col BOOLEAN 1 csv/types.csv 0
+int_col BIGINT 1 csv/types.csv 1
+float_col DOUBLE 1 csv/types.csv 2
+date_col DATE 1 csv/types.csv 3
+ts_sec TIMESTAMP 1 csv/types.csv 4
+ts_ms TIMESTAMP 1 csv/types.csv 5
+ts_us TIMESTAMP 1 csv/types.csv 6
+ts_ns TIMESTAMP 1 csv/types.csv 7
+utf8_col VARCHAR 1 csv/types.csv 8
+
+query TTBTI
select * from infer_schema(location => '@data/csv/merge/', file_format => 'head_csv_format');
----
-col1 VARCHAR 1 0
-col2 VARCHAR 1 1
-col3 VARCHAR 1 2
-col4 VARCHAR 1 3
-col5 VARCHAR 1 4
-
-query TTBI
-select * from infer_schema(location => '@data/csv/max_file_count/', file_format => 'head_csv_format', max_file_count => 2);
-----
-col1 BIGINT 1 0
-col2 BIGINT 1 1
-col3 BIGINT 1 2
-col4 BIGINT 1 3
-col5 BIGINT 1 4
+col1 VARCHAR 1 csv/merge/numbers_with_last_string.csv, csv/merge/numbers.csv 0
+col2 VARCHAR 1 csv/merge/numbers_with_last_string.csv, csv/merge/numbers.csv 1
+col3 VARCHAR 1 csv/merge/numbers_with_last_string.csv, csv/merge/numbers.csv 2
+col4 VARCHAR 1 csv/merge/numbers_with_last_string.csv, csv/merge/numbers.csv 3
+col5 VARCHAR 1 csv/merge/numbers_with_last_string.csv, csv/merge/numbers.csv 4
+
+query T
+select CASE
+ WHEN filenames LIKE '%,%'
+ THEN 'Y'
+ ELSE 'N'
+ END AS format_check
+from infer_schema(location => '@data/csv/max_file_count/', file_format => 'head_csv_format', max_file_count => 2);
+----
+Y
+Y
+Y
+Y
+Y
# NDJSON
-query TTBI
+query TTBTI
select * from infer_schema(location => '@data/ndjson/numbers.ndjson', file_format => 'NDJSON');
----
-id BIGINT 1 0
-value BIGINT 1 1
+id BIGINT 1 ndjson/numbers.ndjson 0
+value BIGINT 1 ndjson/numbers.ndjson 1
-query TTBI
+query TTBTI
select * from infer_schema(location => '@data/ndjson/ragged.ndjson', file_format => 'NDJSON');
----
-id BIGINT 1 0
-value BIGINT 1 1
-comment VARCHAR 1 2
+id BIGINT 1 ndjson/ragged.ndjson 0
+value BIGINT 1 ndjson/ragged.ndjson 1
+comment VARCHAR 1 ndjson/ragged.ndjson 2
-query TTBI
+query TTBTI
select * from infer_schema(location => '@data/ndjson/max_records.ndjson', file_format => 'NDJSON');
----
-id BIGINT 1 0
-value VARCHAR 1 1
+id BIGINT 1 ndjson/max_records.ndjson 0
+value VARCHAR 1 ndjson/max_records.ndjson 1
-query TTBI
+query TTBTI
select * from infer_schema(location => '@data/ndjson/max_records.ndjson', file_format => 'NDJSON', max_records_pre_file => 5);
----
-id BIGINT 1 0
-value BIGINT 1 1
+id BIGINT 1 ndjson/max_records.ndjson 0
+value BIGINT 1 ndjson/max_records.ndjson 1
-query TTBI
+query TTBTI
select * from infer_schema(location => '@data/ndjson/max_records.zip', file_format => 'NDJSON', max_records_pre_file => 5);
----
-id BIGINT 1 0
-value BIGINT 1 1
+id BIGINT 1 ndjson/max_records.zip 0
+value BIGINT 1 ndjson/max_records.zip 1
-query TTBI
+query TTBTI
select * from infer_schema(location => '@data/ndjson/max_records.zst', file_format => 'NDJSON', max_records_pre_file => 5);
----
-id BIGINT 1 0
-value BIGINT 1 1
+id BIGINT 1 ndjson/max_records.zst 0
+value BIGINT 1 ndjson/max_records.zst 1
-query TTBI
+query TTBTI
select * from infer_schema(location => '@data/ndjson/max_records.ndjson', file_format => 'NDJSON', max_records_pre_file => 5);
----
-id BIGINT 1 0
-value BIGINT 1 1
+id BIGINT 1 ndjson/max_records.ndjson 0
+value BIGINT 1 ndjson/max_records.ndjson 1
-query TTBI
+query TTBTI
select * from infer_schema(location => '@data/ndjson/max_records.xz', file_format => 'NDJSON', max_records_pre_file => 5)
----
-id BIGINT 1 0
-value BIGINT 1 1
+id BIGINT 1 ndjson/max_records.xz 0
+value BIGINT 1 ndjson/max_records.xz 1
-query TTBI
+query TTBTI
select * from infer_schema(location => '@data/ndjson/types.ndjson', file_format => 'NDJSON')
----
-bool_col BOOLEAN 1 0
-int_col BIGINT 1 1
-float_col DOUBLE 1 2
-date_col VARCHAR 1 3
-ts_sec VARCHAR 1 4
-ts_ms VARCHAR 1 5
-ts_us VARCHAR 1 6
-ts_ns VARCHAR 1 7
-utf8_col VARCHAR 1 8
-arr_col ARRAY(STRING) 1 9
-obj_col TUPLE(A INT64, B STRING) 1 10
-
-query TTBI
+bool_col BOOLEAN 1 ndjson/types.ndjson 0
+int_col BIGINT 1 ndjson/types.ndjson 1
+float_col DOUBLE 1 ndjson/types.ndjson 2
+date_col VARCHAR 1 ndjson/types.ndjson 3
+ts_sec VARCHAR 1 ndjson/types.ndjson 4
+ts_ms VARCHAR 1 ndjson/types.ndjson 5
+ts_us VARCHAR 1 ndjson/types.ndjson 6
+ts_ns VARCHAR 1 ndjson/types.ndjson 7
+utf8_col VARCHAR 1 ndjson/types.ndjson 8
+arr_col ARRAY(STRING) 1 ndjson/types.ndjson 9
+obj_col TUPLE(A INT64, B STRING) 1 ndjson/types.ndjson 10
+
+query TTBTI
select * from infer_schema(location => '@data/ndjson/merge/', file_format => 'NDJSON');
----
-col1 VARCHAR 1 0
-col2 VARCHAR 1 1
-col3 VARCHAR 1 2
-col4 VARCHAR 1 3
-col5 VARCHAR 1 4
+col1 VARCHAR 1 ndjson/merge/numbers_with_last_string.ndjson, ndjson/merge/numbers.ndjson 0
+col2 VARCHAR 1 ndjson/merge/numbers_with_last_string.ndjson, ndjson/merge/numbers.ndjson 1
+col3 VARCHAR 1 ndjson/merge/numbers_with_last_string.ndjson, ndjson/merge/numbers.ndjson 2
+col4 VARCHAR 1 ndjson/merge/numbers_with_last_string.ndjson, ndjson/merge/numbers.ndjson 3
+col5 VARCHAR 1 ndjson/merge/numbers_with_last_string.ndjson, ndjson/merge/numbers.ndjson 4
+
+query T
+select CASE
+ WHEN filenames LIKE '%,%'
+ THEN 'Y'
+ ELSE 'N'
+ END AS format_check
+from infer_schema(location => '@data/ndjson/max_file_count/', file_format => 'NDJSON', max_file_count => 2);
+----
+Y
+Y
\ No newline at end of file

From 4b6ef6d8e9cc9a0e77feeef7c46c0e07c6d87322 Mon Sep 17 00:00:00 2001
From: kould
Date: Tue, 9 Sep 2025 20:34:36 +0800
Subject: [PATCH 20/20] chore: codefmt

---
 .../stage/formats/parquet/infer_schema.test | 24 +++++++++----------
 .../options/parquet_missing_field.test | 14 +++++------
 .../formats/parquet/parquet_field_types.test | 14 +++++------
 3 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test
index cf67ad1ea49ca..9113d03729c9e 100644
--- a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test
+++ b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test
@@ -147,13 +147,13 @@ ts_ns TIMESTAMP 1 csv/types.csv 7
utf8_col VARCHAR 1 csv/types.csv 8
query TTBTI
-select * from infer_schema(location => '@data/csv/merge/', file_format => 'head_csv_format');
+select column_name, type, nullable, order_id from infer_schema(location => '@data/csv/merge/', file_format => 'head_csv_format');
----
-col1 VARCHAR 1 csv/merge/numbers_with_last_string.csv, csv/merge/numbers.csv 0
-col2 VARCHAR 1 csv/merge/numbers_with_last_string.csv, csv/merge/numbers.csv 1
-col3 VARCHAR 1 csv/merge/numbers_with_last_string.csv, csv/merge/numbers.csv 2
-col4 VARCHAR 1 csv/merge/numbers_with_last_string.csv, csv/merge/numbers.csv 3
-col5 VARCHAR 1 csv/merge/numbers_with_last_string.csv, csv/merge/numbers.csv 4
+col1 VARCHAR 1 0
+col2 VARCHAR 1 1
+col3 VARCHAR 1 2
+col4 VARCHAR 1 3
+col5 VARCHAR 1 4
query T
select CASE
@@ -235,13 +235,13 @@ arr_col ARRAY(STRING) 1 ndjson/types.ndjson 9
obj_col TUPLE(A INT64, B STRING) 1 ndjson/types.ndjson 10
query TTBTI
-select * from infer_schema(location => '@data/ndjson/merge/', file_format => 'NDJSON');
+select column_name, type, nullable, order_id from infer_schema(location => '@data/ndjson/merge/', file_format => 'NDJSON');
----
-col1 VARCHAR 1 ndjson/merge/numbers_with_last_string.ndjson, ndjson/merge/numbers.ndjson 0
-col2 VARCHAR 1 ndjson/merge/numbers_with_last_string.ndjson, ndjson/merge/numbers.ndjson 1
-col3 VARCHAR 1 ndjson/merge/numbers_with_last_string.ndjson, ndjson/merge/numbers.ndjson 2
-col4 VARCHAR 1 ndjson/merge/numbers_with_last_string.ndjson, ndjson/merge/numbers.ndjson 3
-col5 VARCHAR 1 ndjson/merge/numbers_with_last_string.ndjson, ndjson/merge/numbers.ndjson 4
+col1 VARCHAR 1 0
+col2 VARCHAR 1 1
+col3 VARCHAR 1 2
+col4 VARCHAR 1 3
+col5 VARCHAR 1 4
query T
select CASE
diff --git a/tests/sqllogictests/suites/stage/formats/parquet/options/parquet_missing_field.test b/tests/sqllogictests/suites/stage/formats/parquet/options/parquet_missing_field.test
index 522f9167dc50b..9e5e2344678a3 100644
--- a/tests/sqllogictests/suites/stage/formats/parquet/options/parquet_missing_field.test
+++ b/tests/sqllogictests/suites/stage/formats/parquet/options/parquet_missing_field.test
@@ -7,17 +7,17 @@ create table t1 (c1 int, c2 int, c3 int64, c4 string default 'ok')
query
select * from infer_schema(location => '@data/parquet/diff_schema/f1.parquet')
----
-c1 BIGINT 1 0
-c2 SMALLINT 1 1
-c3 BIGINT 1 2
+c1 BIGINT 1 parquet/diff_schema/f1.parquet 0
+c2 SMALLINT 1 parquet/diff_schema/f1.parquet 1
+c3 BIGINT 1 parquet/diff_schema/f1.parquet 2
query
select * from infer_schema(location => '@data/parquet/diff_schema/f2.parquet')
----
-c6 BIGINT 1 0
-c5 BIGINT 1 1
-c2 BIGINT 1 2
-c4 VARCHAR 1 3
+c6 BIGINT 1 parquet/diff_schema/f2.parquet 0
+c5 BIGINT 1 parquet/diff_schema/f2.parquet 1
+c2 BIGINT 1 parquet/diff_schema/f2.parquet 2
+c4 VARCHAR 1 parquet/diff_schema/f2.parquet 3
query error
copy into t1 from @data/parquet/diff_schema/ file_format=(type=parquet) pattern='.*[.]parquet'
diff --git a/tests/sqllogictests/suites/stage/formats/parquet/parquet_field_types.test b/tests/sqllogictests/suites/stage/formats/parquet/parquet_field_types.test
index 1565b70444b99..94e2bc39feb0c 100644
--- a/tests/sqllogictests/suites/stage/formats/parquet/parquet_field_types.test
+++ b/tests/sqllogictests/suites/stage/formats/parquet/parquet_field_types.test
@@ -106,13 +106,13 @@ NULL
query
select * from infer_schema (location => '@data/parquet/int96.parquet')
----
-id VARCHAR 1 0
-t_bool BOOLEAN 1 1
-t_float FLOAT 1 2
-t_double DOUBLE 1 3
-t_timestamp TIMESTAMP 1 4
-t_data DATE 1 5
-t_array ARRAY(INT32) 1 6
+id VARCHAR 1 parquet/int96.parquet 0
+t_bool BOOLEAN 1 parquet/int96.parquet 1
+t_float FLOAT 1 parquet/int96.parquet 2
+t_double DOUBLE 1 parquet/int96.parquet 3
+t_timestamp TIMESTAMP 1 parquet/int96.parquet 4
+t_data DATE 1 parquet/int96.parquet 5
+t_array ARRAY(INT32) 1 parquet/int96.parquet 6
# the physical type of column t_timestamp is INT96
query