From e28fe0690bcc7c83ba8643e61088b85505c8ca4c Mon Sep 17 00:00:00 2001 From: kould Date: Mon, 18 Aug 2025 16:15:09 +0800 Subject: [PATCH 01/20] feat: `infer_schema` expands csv and ndjson support --- Cargo.lock | 2 + Cargo.toml | 2 + src/query/catalog/src/table_args.rs | 6 ++ src/query/service/Cargo.toml | 2 + .../infer_schema/infer_schema_table.rs | 6 +- .../src/table_functions/infer_schema/mod.rs | 2 +- .../infer_schema/{parquet.rs => source.rs} | 90 +++++++++++++++++-- .../infer_schema/table_args.rs | 7 ++ tests/data/csv/max_records.csv | 11 +++ tests/data/csv/mixed.csv | 4 + tests/data/csv/numbers_with_headers.csv | 19 ++++ tests/data/csv/ragged.csv | 5 ++ tests/data/ndjson/max_records.ndjson | 10 +++ tests/data/ndjson/mixed.ndjson | 3 + tests/data/ndjson/numbers.ndjson | 3 + tests/data/ndjson/ragged.ndjson | 4 + .../stage/formats/parquet/infer_schema.test | 57 ++++++++++++ 17 files changed, 221 insertions(+), 12 deletions(-) rename src/query/service/src/table_functions/infer_schema/{parquet.rs => source.rs} (68%) create mode 100644 tests/data/csv/max_records.csv create mode 100644 tests/data/csv/mixed.csv create mode 100644 tests/data/csv/numbers_with_headers.csv create mode 100644 tests/data/csv/ragged.csv create mode 100644 tests/data/ndjson/max_records.ndjson create mode 100644 tests/data/ndjson/mixed.ndjson create mode 100644 tests/data/ndjson/numbers.ndjson create mode 100644 tests/data/ndjson/ragged.ndjson diff --git a/Cargo.lock b/Cargo.lock index e39580a258d30..e282242eb03ce 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5159,8 +5159,10 @@ dependencies = [ "arrow-array", "arrow-buffer", "arrow-cast", + "arrow-csv", "arrow-flight", "arrow-ipc", + "arrow-json", "arrow-schema", "arrow-select", "arrow-udf-runtime", diff --git a/Cargo.toml b/Cargo.toml index a7036e01a4101..54a1ed641fc68 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -231,8 +231,10 @@ arrow = { version = "55" } arrow-array = { version = "55" } arrow-buffer = { version = "55" } arrow-cast = { version = "55", features = ["prettyprint"] } +arrow-csv = { version = "55" } arrow-data = { version = "55" } arrow-flight = { version = "55", features = ["flight-sql-experimental", "tls"] } +arrow-json = { version = "55" } arrow-ipc = { version = "55", features = ["lz4", "zstd"] } arrow-ord = { version = "55" } arrow-schema = { version = "55", features = ["serde"] } diff --git a/src/query/catalog/src/table_args.rs b/src/query/catalog/src/table_args.rs index dab8c366e9bad..4f1c26e1596f6 100644 --- a/src/query/catalog/src/table_args.rs +++ b/src/query/catalog/src/table_args.rs @@ -119,6 +119,12 @@ pub fn bool_value(value: &Scalar) -> Result { } } +pub fn i64_value(value: &Scalar) -> Result { + value.get_i64().ok_or_else(|| { + ErrorCode::BadArguments(format!("invalid value {value} expect to be i64 literal.")) + }) +} + pub fn string_literal(val: &str) -> Scalar { Scalar::String(val.to_string()) } diff --git a/src/query/service/Cargo.toml b/src/query/service/Cargo.toml index 0016398385d49..22e0b000d2376 100644 --- a/src/query/service/Cargo.toml +++ b/src/query/service/Cargo.toml @@ -23,7 +23,9 @@ io-uring = [ anyhow = { workspace = true } arrow-array = { workspace = true } arrow-buffer = { workspace = true } +arrow-csv = { workspace = true } arrow-flight = { workspace = true } +arrow-json = { workspace = true } arrow-ipc = { workspace = true } arrow-schema = { workspace = true } arrow-select = { workspace = true } diff --git a/src/query/service/src/table_functions/infer_schema/infer_schema_table.rs 
b/src/query/service/src/table_functions/infer_schema/infer_schema_table.rs index b9bb841f4281d..89f37eaf1aa7e 100644 --- a/src/query/service/src/table_functions/infer_schema/infer_schema_table.rs +++ b/src/query/service/src/table_functions/infer_schema/infer_schema_table.rs @@ -32,7 +32,7 @@ use databend_common_meta_app::schema::TableInfo; use databend_common_meta_app::schema::TableMeta; use databend_common_pipeline_core::Pipeline; -use super::parquet::ParquetInferSchemaSource; +use super::source::InferSchemaSource; use crate::sessions::TableContext; use crate::table_functions::infer_schema::table_args::InferSchemaArgsParsed; use crate::table_functions::TableFunction; @@ -114,9 +114,7 @@ impl Table for InferSchemaTable { _put_cache: bool, ) -> Result<()> { pipeline.add_source( - |output| { - ParquetInferSchemaSource::create(ctx.clone(), output, self.args_parsed.clone()) - }, + |output| InferSchemaSource::create(ctx.clone(), output, self.args_parsed.clone()), 1, )?; Ok(()) diff --git a/src/query/service/src/table_functions/infer_schema/mod.rs b/src/query/service/src/table_functions/infer_schema/mod.rs index 804499bf8fa56..7bc1731b442b4 100644 --- a/src/query/service/src/table_functions/infer_schema/mod.rs +++ b/src/query/service/src/table_functions/infer_schema/mod.rs @@ -13,7 +13,7 @@ // limitations under the License. mod infer_schema_table; -mod parquet; +mod source; mod table_args; pub use infer_schema_table::InferSchemaTable; diff --git a/src/query/service/src/table_functions/infer_schema/parquet.rs b/src/query/service/src/table_functions/infer_schema/source.rs similarity index 68% rename from src/query/service/src/table_functions/infer_schema/parquet.rs rename to src/query/service/src/table_functions/infer_schema/source.rs index 753971deab5b7..574c07bbf2322 100644 --- a/src/query/service/src/table_functions/infer_schema/parquet.rs +++ b/src/query/service/src/table_functions/infer_schema/source.rs @@ -13,8 +13,12 @@ // limitations under the License. 
use std::collections::BTreeMap; +use std::io::Cursor; use std::sync::Arc; +use arrow_csv::reader::Format; +use arrow_json::reader::infer_json_schema; +use arrow_schema::Schema as ArrowSchema; use databend_common_ast::ast::FileLocation; use databend_common_ast::ast::UriLocation; use databend_common_catalog::table_context::TableContext; @@ -26,7 +30,8 @@ use databend_common_expression::types::UInt64Type; use databend_common_expression::DataBlock; use databend_common_expression::FromData; use databend_common_expression::TableSchema; -use databend_common_meta_app::principal::StageFileFormatType; +use databend_common_meta_app::principal::CsvFileFormatParams; +use databend_common_meta_app::principal::FileFormatParams; use databend_common_meta_app::principal::StageType; use databend_common_pipeline_core::processors::OutputPort; use databend_common_pipeline_core::processors::ProcessorPtr; @@ -37,24 +42,25 @@ use databend_common_storage::init_stage_operator; use databend_common_storage::read_parquet_schema_async_rs; use databend_common_storage::StageFilesInfo; use databend_common_users::Object; +use opendal::Operator; use opendal::Scheme; use crate::table_functions::infer_schema::infer_schema_table::INFER_SCHEMA; use crate::table_functions::infer_schema::table_args::InferSchemaArgsParsed; -pub(crate) struct ParquetInferSchemaSource { +pub(crate) struct InferSchemaSource { is_finished: bool, ctx: Arc, args_parsed: InferSchemaArgsParsed, } -impl ParquetInferSchemaSource { +impl InferSchemaSource { pub fn create( ctx: Arc, output: Arc, args_parsed: InferSchemaArgsParsed, ) -> Result { - AsyncSourcer::create(ctx.clone(), output, ParquetInferSchemaSource { + AsyncSourcer::create(ctx.clone(), output, InferSchemaSource { is_finished: false, ctx, args_parsed, @@ -63,7 +69,7 @@ impl ParquetInferSchemaSource { } #[async_trait::async_trait] -impl AsyncSource for ParquetInferSchemaSource { +impl AsyncSource for InferSchemaSource { const NAME: &'static str = INFER_SCHEMA; #[async_backtrace::framed] @@ -127,9 +133,9 @@ impl AsyncSource for ParquetInferSchemaSource { Some(f) => self.ctx.get_file_format(f).await?, None => stage_info.file_format_params.clone(), }; - let schema = match (first_file.as_ref(), file_format_params.get_type()) { + let schema = match (first_file.as_ref(), file_format_params) { (None, _) => return Ok(None), - (Some(first_file), StageFileFormatType::Parquet) => { + (Some(first_file), FileFormatParams::Parquet(_)) => { let arrow_schema = read_parquet_schema_async_rs( &operator, &first_file.path, @@ -138,6 +144,27 @@ impl AsyncSource for ParquetInferSchemaSource { .await?; TableSchema::try_from(&arrow_schema)? } + (Some(first_file), FileFormatParams::Csv(params)) => { + let arrow_schema = read_csv_metadata_async( + &first_file.path, + &operator, + Some(first_file.size), + self.args_parsed.max_records, + ¶ms, + ) + .await?; + TableSchema::try_from(&arrow_schema)? + } + (Some(first_file), FileFormatParams::NdJson(_)) => { + let arrow_schema = read_json_metadata_async( + &first_file.path, + &operator, + Some(first_file.size), + self.args_parsed.max_records, + ) + .await?; + TableSchema::try_from(&arrow_schema)? 
+ } _ => { return Err(ErrorCode::BadArguments( "infer_schema is currently limited to format Parquet", @@ -168,3 +195,52 @@ impl AsyncSource for ParquetInferSchemaSource { Ok(Some(block)) } } + +pub async fn read_csv_metadata_async( + path: &str, + operator: &Operator, + file_size: Option, + max_records: Option, + params: &CsvFileFormatParams, +) -> Result { + let file_size = match file_size { + None => operator.stat(path).await?.content_length(), + Some(n) => n, + }; + let escape = if params.escape.is_empty() { + None + } else { + Some(params.escape.as_bytes()[0]) + }; + + // TODO: It would be better if it could be read in the form of Read trait + let buf = operator.read_with(path).range(..file_size).await?.to_vec(); + let mut format = Format::default() + .with_delimiter(params.field_delimiter.as_bytes()[0]) + .with_quote(params.quote.as_bytes()[0]) + .with_header(params.headers != 0); + + if let Some(escape) = escape { + format = format.with_escape(escape); + } + let (schema, _) = format.infer_schema(Cursor::new(&buf), max_records)?; + + Ok(schema) +} + +pub async fn read_json_metadata_async( + path: &str, + operator: &Operator, + file_size: Option, + max_records: Option, +) -> Result { + let file_size = match file_size { + None => operator.stat(path).await?.content_length(), + Some(n) => n, + }; + // TODO: It would be better if it could be read in the form of Read trait + let buf = operator.read_with(path).range(..file_size).await?.to_vec(); + let (schema, _) = infer_json_schema(Cursor::new(&buf), max_records)?; + + Ok(schema) +} diff --git a/src/query/service/src/table_functions/infer_schema/table_args.rs b/src/query/service/src/table_functions/infer_schema/table_args.rs index 07d359d5985a5..4bbf0ef113713 100644 --- a/src/query/service/src/table_functions/infer_schema/table_args.rs +++ b/src/query/service/src/table_functions/infer_schema/table_args.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use databend_common_catalog::table_args::i64_value; use databend_common_catalog::table_args::TableArgs; use databend_common_exception::ErrorCode; use databend_common_exception::Result; @@ -24,6 +25,7 @@ pub(crate) struct InferSchemaArgsParsed { pub(crate) connection_name: Option, pub(crate) file_format: Option, pub(crate) files_info: StageFilesInfo, + pub(crate) max_records: Option, } impl InferSchemaArgsParsed { @@ -38,6 +40,7 @@ impl InferSchemaArgsParsed { files: None, pattern: None, }; + let mut max_records = None; for (k, v) in &args { match k.to_lowercase().as_str() { @@ -53,6 +56,9 @@ impl InferSchemaArgsParsed { "file_format" => { file_format = Some(string_value(v)?); } + "max_records_pre_file" => { + max_records = Some(i64_value(v)? 
as usize); + } _ => { return Err(ErrorCode::BadArguments(format!( "unknown param {} for infer_schema", @@ -70,6 +76,7 @@ impl InferSchemaArgsParsed { connection_name, file_format, files_info, + max_records, }) } } diff --git a/tests/data/csv/max_records.csv b/tests/data/csv/max_records.csv new file mode 100644 index 0000000000000..5e52f31e5dd5d --- /dev/null +++ b/tests/data/csv/max_records.csv @@ -0,0 +1,11 @@ +id,value +1,100 +2,200 +3,300 +4,400 +5,500 +6,foo +7,bar +8,baz +9,qux +10,quux diff --git a/tests/data/csv/mixed.csv b/tests/data/csv/mixed.csv new file mode 100644 index 0000000000000..203cdde68ced0 --- /dev/null +++ b/tests/data/csv/mixed.csv @@ -0,0 +1,4 @@ +id,name,score,active +1,Alice,88.5,true +2,Bob,92.0,false +3,Charlie,,true diff --git a/tests/data/csv/numbers_with_headers.csv b/tests/data/csv/numbers_with_headers.csv new file mode 100644 index 0000000000000..85e74e0d15564 --- /dev/null +++ b/tests/data/csv/numbers_with_headers.csv @@ -0,0 +1,19 @@ +id,value +0,1 +1,2 +2,3 +3,4 +4,5 +5,6 +6,7 +7,8 +8,9 +9,10 +10,11 +11,12 +12,13 +13,14 +14,15 +15,16 +16,17 +17,18 diff --git a/tests/data/csv/ragged.csv b/tests/data/csv/ragged.csv new file mode 100644 index 0000000000000..c0cdce65d93c2 --- /dev/null +++ b/tests/data/csv/ragged.csv @@ -0,0 +1,5 @@ +id,value,comment +1,10,ok +2,20 +3,30,missing one field +4 diff --git a/tests/data/ndjson/max_records.ndjson b/tests/data/ndjson/max_records.ndjson new file mode 100644 index 0000000000000..079f2c82061f1 --- /dev/null +++ b/tests/data/ndjson/max_records.ndjson @@ -0,0 +1,10 @@ +{"id": 1, "value": 100} +{"id": 2, "value": 200} +{"id": 3, "value": 300} +{"id": 4, "value": 400} +{"id": 5, "value": 500} +{"id": 6, "value": "foo"} +{"id": 7, "value": "bar"} +{"id": 8, "value": "baz"} +{"id": 9, "value": "qux"} +{"id": 10, "value": "quux"} diff --git a/tests/data/ndjson/mixed.ndjson b/tests/data/ndjson/mixed.ndjson new file mode 100644 index 0000000000000..f9c139d2f5175 --- /dev/null +++ b/tests/data/ndjson/mixed.ndjson @@ -0,0 +1,3 @@ +{"id": 1, "name": "Alice", "score": 88.5, "active": true} +{"id": 2, "name": "Bob", "score": 92.0, "active": false} +{"id": 3, "name": "Charlie", "score": null, "active": true} diff --git a/tests/data/ndjson/numbers.ndjson b/tests/data/ndjson/numbers.ndjson new file mode 100644 index 0000000000000..aecddc3762d07 --- /dev/null +++ b/tests/data/ndjson/numbers.ndjson @@ -0,0 +1,3 @@ +{"id": 1, "value": 100} +{"id": 2, "value": 200} +{"id": 3, "value": 300} diff --git a/tests/data/ndjson/ragged.ndjson b/tests/data/ndjson/ragged.ndjson new file mode 100644 index 0000000000000..847a327073c2c --- /dev/null +++ b/tests/data/ndjson/ragged.ndjson @@ -0,0 +1,4 @@ +{"id": 1, "value": 10, "comment": "ok"} +{"id": 2, "value": 20} +{"id": 3, "value": 30, "comment": "missing one field"} +{"id": 4} diff --git a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test index 7304db2b5d09d..4502b9c088426 100644 --- a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test +++ b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test @@ -66,3 +66,60 @@ select * from INFER_SCHEMA(location => 's3://testbucket/data/parquet/tuple.parqu ---- id INT 0 0 t TUPLE(A INT32, B STRING) 0 1 + +# CSV +statement ok +create or replace file format head_csv_format type = 'CSV' field_delimiter = ',' skip_header = 1; + +query TTBI +select * from infer_schema(location => '@data/csv/numbers_with_headers.csv', file_format => 'CSV'); +---- 
+column_1 VARCHAR 1 0 +column_2 VARCHAR 1 1 + +query TTBI +select * from infer_schema(location => '@data/csv/numbers_with_headers.csv', file_format => 'head_csv_format'); +---- +id BIGINT 1 0 +value BIGINT 1 1 + +statement error +select * from infer_schema(location => '@data/csv/ragged.csv', file_format => 'head_csv_format'); + +query TTBI +select * from infer_schema(location => '@data/csv/max_records.csv', file_format => 'head_csv_format'); +---- +id BIGINT 1 0 +value VARCHAR 1 1 + +query TTBI +select * from infer_schema(location => '@data/csv/max_records.csv', file_format => 'head_csv_format', max_records_pre_file => 5); +---- +id BIGINT 1 0 +value BIGINT 1 1 + +# NDJSON +query TTBI +select * from infer_schema(location => '@data/ndjson/numbers.ndjson', file_format => 'NDJSON'); +---- +id BIGINT 1 0 +value BIGINT 1 1 + +query TTBI +select * from infer_schema(location => '@data/ndjson/ragged.ndjson', file_format => 'NDJSON'); +---- +id BIGINT 1 0 +value BIGINT 1 1 +comment VARCHAR 1 2 + +query TTBI +select * from infer_schema(location => '@data/ndjson/max_records.ndjson', file_format => 'NDJSON'); +---- +id BIGINT 1 0 +value VARCHAR 1 1 + +query TTBI +select * from infer_schema(location => '@data/ndjson/max_records.ndjson', file_format => 'NDJSON', max_records_pre_file => 5); +---- +id BIGINT 1 0 +value BIGINT 1 1 From f39edd1e30ce80e54001c1de499063486abc8ad2 Mon Sep 17 00:00:00 2001 From: kould Date: Mon, 18 Aug 2025 16:31:15 +0800 Subject: [PATCH 02/20] chore: codefmt --- Cargo.toml | 2 +- src/query/service/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 54a1ed641fc68..19b0f0017a71b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -234,8 +234,8 @@ arrow-cast = { version = "55", features = ["prettyprint"] } arrow-csv = { version = "55" } arrow-data = { version = "55" } arrow-flight = { version = "55", features = ["flight-sql-experimental", "tls"] } -arrow-json = { version = "55" } arrow-ipc = { version = "55", features = ["lz4", "zstd"] } +arrow-json = { version = "55" } arrow-ord = { version = "55" } arrow-schema = { version = "55", features = ["serde"] } arrow-select = { version = "55" } diff --git a/src/query/service/Cargo.toml b/src/query/service/Cargo.toml index 22e0b000d2376..2bc06f9d0973f 100644 --- a/src/query/service/Cargo.toml +++ b/src/query/service/Cargo.toml @@ -25,8 +25,8 @@ arrow-array = { workspace = true } arrow-buffer = { workspace = true } arrow-csv = { workspace = true } arrow-flight = { workspace = true } -arrow-json = { workspace = true } arrow-ipc = { workspace = true } +arrow-json = { workspace = true } arrow-schema = { workspace = true } arrow-select = { workspace = true } arrow-udf-runtime = { workspace = true } From cc4d62eca6c3e059532699b54d02b0b445adab3a Mon Sep 17 00:00:00 2001 From: kould Date: Tue, 19 Aug 2025 12:57:06 +0800 Subject: [PATCH 03/20] chore: add check on csv and ndjson compression --- .../table_functions/infer_schema/source.rs | 24 +++++++++++++------ 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/src/query/service/src/table_functions/infer_schema/source.rs b/src/query/service/src/table_functions/infer_schema/source.rs index 574c07bbf2322..8f1fb795530cb 100644 --- a/src/query/service/src/table_functions/infer_schema/source.rs +++ b/src/query/service/src/table_functions/infer_schema/source.rs @@ -13,7 +13,6 @@ // limitations under the License. 
use std::collections::BTreeMap; -use std::io::Cursor; use std::sync::Arc; use arrow_csv::reader::Format; @@ -32,6 +31,7 @@ use databend_common_expression::FromData; use databend_common_expression::TableSchema; use databend_common_meta_app::principal::CsvFileFormatParams; use databend_common_meta_app::principal::FileFormatParams; +use databend_common_meta_app::principal::StageFileCompression; use databend_common_meta_app::principal::StageType; use databend_common_pipeline_core::processors::OutputPort; use databend_common_pipeline_core::processors::ProcessorPtr; @@ -145,6 +145,11 @@ impl AsyncSource for InferSchemaSource { TableSchema::try_from(&arrow_schema)? } (Some(first_file), FileFormatParams::Csv(params)) => { + if params.compression != StageFileCompression::None { + return Err(ErrorCode::InvalidCompressionData( + "Compressed CSV files are not supported", + )); + } let arrow_schema = read_csv_metadata_async( &first_file.path, &operator, @@ -155,7 +160,12 @@ impl AsyncSource for InferSchemaSource { .await?; TableSchema::try_from(&arrow_schema)? } - (Some(first_file), FileFormatParams::NdJson(_)) => { + (Some(first_file), FileFormatParams::NdJson(params)) => { + if params.compression != StageFileCompression::None { + return Err(ErrorCode::InvalidCompressionData( + "Compressed NDJSON files are not supported", + )); + } let arrow_schema = read_json_metadata_async( &first_file.path, &operator, @@ -167,7 +177,7 @@ impl AsyncSource for InferSchemaSource { } _ => { return Err(ErrorCode::BadArguments( - "infer_schema is currently limited to format Parquet", + "infer_schema is currently limited to format Parquet, CSV and NDJSON", )); } }; @@ -214,7 +224,7 @@ pub async fn read_csv_metadata_async( }; // TODO: It would be better if it could be read in the form of Read trait - let buf = operator.read_with(path).range(..file_size).await?.to_vec(); + let buf = operator.read_with(path).range(..file_size).await?; let mut format = Format::default() .with_delimiter(params.field_delimiter.as_bytes()[0]) .with_quote(params.quote.as_bytes()[0]) @@ -223,7 +233,7 @@ pub async fn read_csv_metadata_async( if let Some(escape) = escape { format = format.with_escape(escape); } - let (schema, _) = format.infer_schema(Cursor::new(&buf), max_records)?; + let (schema, _) = format.infer_schema(buf, max_records)?; Ok(schema) } @@ -239,8 +249,8 @@ pub async fn read_json_metadata_async( Some(n) => n, }; // TODO: It would be better if it could be read in the form of Read trait - let buf = operator.read_with(path).range(..file_size).await?.to_vec(); - let (schema, _) = infer_json_schema(Cursor::new(&buf), max_records)?; + let buf = operator.read_with(path).range(..file_size).await?; + let (schema, _) = infer_json_schema(buf, max_records)?; Ok(schema) } From 1481aa69ca0132a72874c982a497286893f9a712 Mon Sep 17 00:00:00 2001 From: kould Date: Tue, 19 Aug 2025 17:52:12 +0800 Subject: [PATCH 04/20] chore: add `max_bytes` --- .../table_functions/infer_schema/source.rs | 15 ++++++++---- .../infer_schema/table_args.rs | 6 +++++ .../stage/formats/parquet/infer_schema.test | 24 +++++++++++++++++++ 3 files changed, 41 insertions(+), 4 deletions(-) diff --git a/src/query/service/src/table_functions/infer_schema/source.rs b/src/query/service/src/table_functions/infer_schema/source.rs index 8f1fb795530cb..b0ad8663803aa 100644 --- a/src/query/service/src/table_functions/infer_schema/source.rs +++ b/src/query/service/src/table_functions/infer_schema/source.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing 
permissions and // limitations under the License. +use std::cmp; use std::collections::BTreeMap; use std::sync::Arc; @@ -48,6 +49,8 @@ use opendal::Scheme; use crate::table_functions::infer_schema::infer_schema_table::INFER_SCHEMA; use crate::table_functions::infer_schema::table_args::InferSchemaArgsParsed; +const DEFAULT_MAX_BYTES: u64 = 1 * 1024 * 1024; + pub(crate) struct InferSchemaSource { is_finished: bool, ctx: Arc, @@ -154,6 +157,7 @@ impl AsyncSource for InferSchemaSource { &first_file.path, &operator, Some(first_file.size), + self.args_parsed.max_bytes, self.args_parsed.max_records, ¶ms, ) @@ -170,6 +174,7 @@ impl AsyncSource for InferSchemaSource { &first_file.path, &operator, Some(first_file.size), + self.args_parsed.max_bytes, self.args_parsed.max_records, ) .await?; @@ -210,6 +215,7 @@ pub async fn read_csv_metadata_async( path: &str, operator: &Operator, file_size: Option, + max_bytes: Option, max_records: Option, params: &CsvFileFormatParams, ) -> Result { @@ -223,8 +229,8 @@ pub async fn read_csv_metadata_async( Some(params.escape.as_bytes()[0]) }; - // TODO: It would be better if it could be read in the form of Read trait - let buf = operator.read_with(path).range(..file_size).await?; + let bytes_len = cmp::min(max_bytes.unwrap_or(DEFAULT_MAX_BYTES), file_size); + let buf = operator.read_with(path).range(..bytes_len).await?; let mut format = Format::default() .with_delimiter(params.field_delimiter.as_bytes()[0]) .with_quote(params.quote.as_bytes()[0]) @@ -242,14 +248,15 @@ pub async fn read_json_metadata_async( path: &str, operator: &Operator, file_size: Option, + max_bytes: Option, max_records: Option, ) -> Result { let file_size = match file_size { None => operator.stat(path).await?.content_length(), Some(n) => n, }; - // TODO: It would be better if it could be read in the form of Read trait - let buf = operator.read_with(path).range(..file_size).await?; + let bytes_len = cmp::min(max_bytes.unwrap_or(DEFAULT_MAX_BYTES), file_size); + let buf = operator.read_with(path).range(..bytes_len).await?; let (schema, _) = infer_json_schema(buf, max_records)?; Ok(schema) diff --git a/src/query/service/src/table_functions/infer_schema/table_args.rs b/src/query/service/src/table_functions/infer_schema/table_args.rs index 4bbf0ef113713..902807c75b580 100644 --- a/src/query/service/src/table_functions/infer_schema/table_args.rs +++ b/src/query/service/src/table_functions/infer_schema/table_args.rs @@ -26,6 +26,7 @@ pub(crate) struct InferSchemaArgsParsed { pub(crate) file_format: Option, pub(crate) files_info: StageFilesInfo, pub(crate) max_records: Option, + pub(crate) max_bytes: Option, } impl InferSchemaArgsParsed { @@ -41,6 +42,7 @@ impl InferSchemaArgsParsed { pattern: None, }; let mut max_records = None; + let mut max_bytes = None; for (k, v) in &args { match k.to_lowercase().as_str() { @@ -59,6 +61,9 @@ impl InferSchemaArgsParsed { "max_records_pre_file" => { max_records = Some(i64_value(v)? as usize); } + "max_bytes" => { + max_bytes = Some(i64_value(v)? 
as u64); + } _ => { return Err(ErrorCode::BadArguments(format!( "unknown param {} for infer_schema", @@ -77,6 +82,7 @@ impl InferSchemaArgsParsed { file_format, files_info, max_records, + max_bytes, }) } } diff --git a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test index 4502b9c088426..5ddb5f48152b7 100644 --- a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test +++ b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test @@ -98,6 +98,18 @@ select * from infer_schema(location => '@data/csv/max_records.csv', file_format id BIGINT 1 0 value BIGINT 1 1 +# max_records.csv is 71 bytes +# enough bytes +query TTBI +select * from infer_schema(location => '@data/csv/max_records.csv', file_format => 'head_csv_format', max_records_pre_file => 5, max_bytes => 15); +---- +id BIGINT 1 0 +value BIGINT 1 1 + +# not enough bytes +statement error +select * from infer_schema(location => '@data/csv/max_records.csv', file_format => 'head_csv_format', max_records_pre_file => 5, max_bytes => 10); + # NDJSON query TTBI select * from infer_schema(location => '@data/ndjson/numbers.ndjson', file_format => 'NDJSON'); @@ -123,3 +135,15 @@ select * from infer_schema(location => '@data/ndjson/max_records.ndjson', file_f ---- id BIGINT 1 0 value BIGINT 1 1 + +# max_records.csv is 252 bytes +# enough bytes +query TTBI +select * from infer_schema(location => '@data/ndjson/max_records.ndjson', file_format => 'NDJSON', max_records_pre_file => 5, max_bytes => 130); +---- +id BIGINT 1 0 +value BIGINT 1 1 + +# not enough bytes +statement error +select * from infer_schema(location => '@data/ndjson/max_records.ndjson', file_format => 'NDJSON', max_records_pre_file => 5, max_bytes => 50); From 54ed2086e2ee4ac8925572bcf2f21623b9bd6c13 Mon Sep 17 00:00:00 2001 From: kould Date: Wed, 20 Aug 2025 11:57:04 +0800 Subject: [PATCH 05/20] feat: support compressed files for infer_schema csv ndjson --- Cargo.lock | 1 + src/query/service/Cargo.toml | 1 + .../table_functions/infer_schema/source.rs | 38 +++++++++++------- tests/data/csv/max_records.csv.zst | Bin 0 -> 76 bytes tests/data/csv/max_records.zip | Bin 0 -> 271 bytes tests/data/ndjson/max_records.ndjson.zst | Bin 0 -> 110 bytes tests/data/ndjson/max_records.zip | Bin 0 -> 302 bytes .../stage/formats/parquet/infer_schema.test | 24 +++++++++++ 8 files changed, 49 insertions(+), 15 deletions(-) create mode 100644 tests/data/csv/max_records.csv.zst create mode 100644 tests/data/csv/max_records.zip create mode 100644 tests/data/ndjson/max_records.ndjson.zst create mode 100644 tests/data/ndjson/max_records.zip diff --git a/Cargo.lock b/Cargo.lock index e282242eb03ce..d72ee7aca19a3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5193,6 +5193,7 @@ dependencies = [ "databend-common-catalog", "databend-common-cloud-control", "databend-common-column", + "databend-common-compress", "databend-common-config", "databend-common-exception", "databend-common-expression", diff --git a/src/query/service/Cargo.toml b/src/query/service/Cargo.toml index 2bc06f9d0973f..f6b0302b314ea 100644 --- a/src/query/service/Cargo.toml +++ b/src/query/service/Cargo.toml @@ -56,6 +56,7 @@ databend-common-cache = { workspace = true } databend-common-catalog = { workspace = true } databend-common-cloud-control = { workspace = true } databend-common-column = { workspace = true } +databend-common-compress = { workspace = true } databend-common-config = { workspace = true } databend-common-exception = { workspace 
= true } databend-common-expression = { workspace = true } diff --git a/src/query/service/src/table_functions/infer_schema/source.rs b/src/query/service/src/table_functions/infer_schema/source.rs index b0ad8663803aa..9868930da2ad1 100644 --- a/src/query/service/src/table_functions/infer_schema/source.rs +++ b/src/query/service/src/table_functions/infer_schema/source.rs @@ -14,6 +14,7 @@ use std::cmp; use std::collections::BTreeMap; +use std::io::Cursor; use std::sync::Arc; use arrow_csv::reader::Format; @@ -22,6 +23,8 @@ use arrow_schema::Schema as ArrowSchema; use databend_common_ast::ast::FileLocation; use databend_common_ast::ast::UriLocation; use databend_common_catalog::table_context::TableContext; +use databend_common_compress::CompressAlgorithm; +use databend_common_compress::DecompressDecoder; use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::types::BooleanType; @@ -32,7 +35,6 @@ use databend_common_expression::FromData; use databend_common_expression::TableSchema; use databend_common_meta_app::principal::CsvFileFormatParams; use databend_common_meta_app::principal::FileFormatParams; -use databend_common_meta_app::principal::StageFileCompression; use databend_common_meta_app::principal::StageType; use databend_common_pipeline_core::processors::OutputPort; use databend_common_pipeline_core::processors::ProcessorPtr; @@ -148,11 +150,6 @@ impl AsyncSource for InferSchemaSource { TableSchema::try_from(&arrow_schema)? } (Some(first_file), FileFormatParams::Csv(params)) => { - if params.compression != StageFileCompression::None { - return Err(ErrorCode::InvalidCompressionData( - "Compressed CSV files are not supported", - )); - } let arrow_schema = read_csv_metadata_async( &first_file.path, &operator, @@ -165,11 +162,6 @@ impl AsyncSource for InferSchemaSource { TableSchema::try_from(&arrow_schema)? } (Some(first_file), FileFormatParams::NdJson(params)) => { - if params.compression != StageFileCompression::None { - return Err(ErrorCode::InvalidCompressionData( - "Compressed NDJSON files are not supported", - )); - } let arrow_schema = read_json_metadata_async( &first_file.path, &operator, @@ -230,7 +222,15 @@ pub async fn read_csv_metadata_async( }; let bytes_len = cmp::min(max_bytes.unwrap_or(DEFAULT_MAX_BYTES), file_size); - let buf = operator.read_with(path).range(..bytes_len).await?; + let mut buf = operator.read_with(path).range(..bytes_len).await?.to_vec(); + + if let Some(algo) = CompressAlgorithm::from_path(path) { + buf = if CompressAlgorithm::Zip == algo { + DecompressDecoder::decompress_all_zip(&buf)? + } else { + DecompressDecoder::new(algo).decompress_batch(&buf)? 
+ }; + } let mut format = Format::default() .with_delimiter(params.field_delimiter.as_bytes()[0]) .with_quote(params.quote.as_bytes()[0]) @@ -239,7 +239,7 @@ pub async fn read_csv_metadata_async( if let Some(escape) = escape { format = format.with_escape(escape); } - let (schema, _) = format.infer_schema(buf, max_records)?; + let (schema, _) = format.infer_schema(Cursor::new(buf), max_records)?; Ok(schema) } @@ -256,8 +256,16 @@ pub async fn read_json_metadata_async( Some(n) => n, }; let bytes_len = cmp::min(max_bytes.unwrap_or(DEFAULT_MAX_BYTES), file_size); - let buf = operator.read_with(path).range(..bytes_len).await?; - let (schema, _) = infer_json_schema(buf, max_records)?; + let mut buf = operator.read_with(path).range(..bytes_len).await?.to_vec(); + + if let Some(algo) = CompressAlgorithm::from_path(path) { + buf = if CompressAlgorithm::Zip == algo { + DecompressDecoder::decompress_all_zip(&buf)? + } else { + DecompressDecoder::new(algo).decompress_batch(&buf)? + }; + } + let (schema, _) = infer_json_schema(Cursor::new(buf), max_records)?; Ok(schema) } diff --git a/tests/data/csv/max_records.csv.zst b/tests/data/csv/max_records.csv.zst new file mode 100644 index 0000000000000000000000000000000000000000..ef35edae5da5e34bef543852e4758960dd95b64d GIT binary patch literal 76 zcmV-S0JHxnwJ-f7NBsc+a>Nc5pg9Lb2+(R|Kj;4dYdP`kvFU%l7Kxx@;s^kPwi0zWnk5JT literal 0 HcmV?d00001 diff --git a/tests/data/csv/max_records.zip b/tests/data/csv/max_records.zip new file mode 100644 index 0000000000000000000000000000000000000000..baea0be135d7f49854fec8335da2e6fd7f5a6a43 GIT binary patch literal 271 zcmWIWW@Zs#-~d9SV3BABC~ybT{0s^Vxrr6=MXAa8MJdI4$;D-%A-oLi3PwvaV7Rn` zn}Lz#1v3K!m^kfoQPca3rteduD~2Wp4~#AuGEF*Wb# literal 0 HcmV?d00001 diff --git a/tests/data/ndjson/max_records.ndjson.zst b/tests/data/ndjson/max_records.ndjson.zst new file mode 100644 index 0000000000000000000000000000000000000000..77821a433bd6ffc639e16213f27f76bd13c023b1 GIT binary patch literal 110 zcmV-!0FnPFwJ-f7{0##DbOU=LX=EZgATcZ;B6eYHb!9LxeF`!%Gcz Q14e+kJCnFomeSvl>ovS2$^ZZW literal 0 HcmV?d00001 diff --git a/tests/data/ndjson/max_records.zip b/tests/data/ndjson/max_records.zip new file mode 100644 index 0000000000000000000000000000000000000000..02da2fa12d2064ef2b8a433cda8806c76ebd8446 GIT binary patch literal 302 zcmWIWW@Zs#-~d9SV3BABDEI@Ug%}hVauX}!i&B&Gi&Bd9@=~&j^YcPOcp2CgjFx7= zaA^fM10%}|W(Ec@u{!LOPvE8?gDZgnTHa?geOWj1@?`B?rTECr1B8x&k($RNAX)}S zX&#TfGJpQ`Dv&HrnYrYMQcDl(Dj!j&V?vS3Hf&*1OwMS~Vx7j!5a7+uQMKrXa0Jk0 qAlC(WGct)VBV3Lw2XZ+KY-t3sNcTs8H!B-RCnFFh0O=hd4g&x_E>wL0 literal 0 HcmV?d00001 diff --git a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test index 5ddb5f48152b7..e3a1a99748098 100644 --- a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test +++ b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test @@ -98,6 +98,18 @@ select * from infer_schema(location => '@data/csv/max_records.csv', file_format id BIGINT 1 0 value BIGINT 1 1 +query TTBI +select * from infer_schema(location => '@data/csv/max_records.zip', file_format => 'head_csv_format', max_records_pre_file => 5); +---- +id BIGINT 1 0 +value BIGINT 1 1 + +query TTBI +select * from infer_schema(location => '@data/csv/max_records.csv.zst', file_format => 'head_csv_format', max_records_pre_file => 5); +---- +id BIGINT 1 0 +value BIGINT 1 1 + # max_records.csv is 71 bytes # enough bytes query TTBI @@ -136,6 +148,18 @@ select * from infer_schema(location => 
'@data/ndjson/max_records.ndjson', file_f id BIGINT 1 0 value BIGINT 1 1 +query TTBI +select * from infer_schema(location => '@data/ndjson/max_records.zip', file_format => 'NDJSON', max_records_pre_file => 5); +---- +id BIGINT 1 0 +value BIGINT 1 1 + +query TTBI +select * from infer_schema(location => '@data/ndjson/max_records.ndjson.zst', file_format => 'NDJSON', max_records_pre_file => 5); +---- +id BIGINT 1 0 +value BIGINT 1 1 + # max_records.csv is 252 bytes # enough bytes query TTBI From 7075934392e6201bc7f2e99ed6997898527bb7b8 Mon Sep 17 00:00:00 2001 From: kould Date: Wed, 20 Aug 2025 13:32:15 +0800 Subject: [PATCH 06/20] chore: add xz on `infer_schema.test` --- tests/data/csv/max_records.csv.xz | Bin 0 -> 124 bytes tests/data/ndjson/max_records.ndjson.xz | Bin 0 -> 144 bytes .../stage/formats/parquet/infer_schema.test | 12 ++++++++++++ 3 files changed, 12 insertions(+) create mode 100644 tests/data/csv/max_records.csv.xz create mode 100644 tests/data/ndjson/max_records.ndjson.xz diff --git a/tests/data/csv/max_records.csv.xz b/tests/data/csv/max_records.csv.xz new file mode 100644 index 0000000000000000000000000000000000000000..25a16f4f85295057ef5da488e59b641fef51112c GIT binary patch literal 124 zcmexsUKJ6=z`*kC+7>q^21Q0O1_p)_{ill`Ft{<;#xj`9WNc$Bdg{8m_}X{(nI;S( z*Nnuv7b!o8-kEZ=sdP_;jZ|l=^a7!mOk!O8C--kJlRrOG!=rANLiXB2KMnwmV(_&I c=`7>YTF<~3>CW2jzF2q^21Q0O1_p)_{ill`F#Klljb*Uy=u~T*99&;2z*eBul+>NN za*p`vFW;vs23BSno>#lz&y$*y>v^Woo%`_Y7j1C{X7~3tUz4fcc4F!Qw!|&_+46VP wU-%&!5jyo~;}@XG3=FS*QzLU!J8BsiGygCG`DFpuZf-o1{+kIT$r2d_06+LSfdBvi literal 0 HcmV?d00001 diff --git a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test index e3a1a99748098..4ed7b1dc50ef2 100644 --- a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test +++ b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test @@ -118,6 +118,12 @@ select * from infer_schema(location => '@data/csv/max_records.csv', file_format id BIGINT 1 0 value BIGINT 1 1 +query TTBI +select * from infer_schema(location => '@data/csv/max_records.csv.xz', file_format => 'head_csv_format', max_records_pre_file => 5, max_bytes => 70); +---- +id BIGINT 1 0 +value BIGINT 1 1 + # not enough bytes statement error select * from infer_schema(location => '@data/csv/max_records.csv', file_format => 'head_csv_format', max_records_pre_file => 5, max_bytes => 10); @@ -168,6 +174,12 @@ select * from infer_schema(location => '@data/ndjson/max_records.ndjson', file_f id BIGINT 1 0 value BIGINT 1 1 +query TTBI +select * from infer_schema(location => '@data/ndjson/max_records.ndjson.xz', file_format => 'NDJSON', max_records_pre_file => 5, max_bytes => 80) +---- +id BIGINT 1 0 +value BIGINT 1 1 + # not enough bytes statement error select * from infer_schema(location => '@data/ndjson/max_records.ndjson', file_format => 'NDJSON', max_records_pre_file => 5, max_bytes => 50); From 7ef9f88cbb15c542c46347cfca2bc1a0be93d7f2 Mon Sep 17 00:00:00 2001 From: kould Date: Wed, 20 Aug 2025 13:35:38 +0800 Subject: [PATCH 07/20] chore: codefmt --- src/query/service/src/table_functions/infer_schema/source.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/query/service/src/table_functions/infer_schema/source.rs b/src/query/service/src/table_functions/infer_schema/source.rs index 9868930da2ad1..aa0bf0096a521 100644 --- a/src/query/service/src/table_functions/infer_schema/source.rs +++ b/src/query/service/src/table_functions/infer_schema/source.rs @@ -51,7 
+51,7 @@ use opendal::Scheme; use crate::table_functions::infer_schema::infer_schema_table::INFER_SCHEMA; use crate::table_functions::infer_schema::table_args::InferSchemaArgsParsed; -const DEFAULT_MAX_BYTES: u64 = 1 * 1024 * 1024; +const DEFAULT_MAX_BYTES: u64 = 1024 * 1024; pub(crate) struct InferSchemaSource { is_finished: bool, @@ -161,7 +161,7 @@ impl AsyncSource for InferSchemaSource { .await?; TableSchema::try_from(&arrow_schema)? } - (Some(first_file), FileFormatParams::NdJson(params)) => { + (Some(first_file), FileFormatParams::NdJson(_)) => { let arrow_schema = read_json_metadata_async( &first_file.path, &operator, From 69dbbd3d015cdb38d589f2b3a9f3b4d50937cf31 Mon Sep 17 00:00:00 2001 From: kould Date: Wed, 20 Aug 2025 16:59:54 +0800 Subject: [PATCH 08/20] feat(infer_schema): remove max_bytes and automatically infer the length when max_records is present --- .../table_functions/infer_schema/source.rs | 114 +++++++++--------- .../infer_schema/table_args.rs | 6 - .../stage/formats/parquet/infer_schema.test | 16 +-- 3 files changed, 64 insertions(+), 72 deletions(-) diff --git a/src/query/service/src/table_functions/infer_schema/source.rs b/src/query/service/src/table_functions/infer_schema/source.rs index aa0bf0096a521..b4d20c22386da 100644 --- a/src/query/service/src/table_functions/infer_schema/source.rs +++ b/src/query/service/src/table_functions/infer_schema/source.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::borrow::Cow; use std::cmp; use std::collections::BTreeMap; use std::io::Cursor; @@ -19,7 +20,9 @@ use std::sync::Arc; use arrow_csv::reader::Format; use arrow_json::reader::infer_json_schema; +use arrow_schema::ArrowError; use arrow_schema::Schema as ArrowSchema; +use bytes::BufMut; use databend_common_ast::ast::FileLocation; use databend_common_ast::ast::UriLocation; use databend_common_catalog::table_context::TableContext; @@ -33,7 +36,6 @@ use databend_common_expression::types::UInt64Type; use databend_common_expression::DataBlock; use databend_common_expression::FromData; use databend_common_expression::TableSchema; -use databend_common_meta_app::principal::CsvFileFormatParams; use databend_common_meta_app::principal::FileFormatParams; use databend_common_meta_app::principal::StageType; use databend_common_pipeline_core::processors::OutputPort; @@ -51,7 +53,7 @@ use opendal::Scheme; use crate::table_functions::infer_schema::infer_schema_table::INFER_SCHEMA; use crate::table_functions::infer_schema::table_args::InferSchemaArgsParsed; -const DEFAULT_MAX_BYTES: u64 = 1024 * 1024; +const DEFAULT_BYTES: u64 = 20; pub(crate) struct InferSchemaSource { is_finished: bool, @@ -150,24 +152,37 @@ impl AsyncSource for InferSchemaSource { TableSchema::try_from(&arrow_schema)? 
} (Some(first_file), FileFormatParams::Csv(params)) => { - let arrow_schema = read_csv_metadata_async( + let escape = if params.escape.is_empty() { + None + } else { + Some(params.escape.as_bytes()[0]) + }; + + let mut format = Format::default() + .with_delimiter(params.field_delimiter.as_bytes()[0]) + .with_quote(params.quote.as_bytes()[0]) + .with_header(params.headers != 0); + if let Some(escape) = escape { + format = format.with_escape(escape); + } + + let arrow_schema = read_metadata_async( &first_file.path, &operator, Some(first_file.size), - self.args_parsed.max_bytes, self.args_parsed.max_records, - ¶ms, + |reader, max_record| format.infer_schema(reader, max_record), ) .await?; TableSchema::try_from(&arrow_schema)? } (Some(first_file), FileFormatParams::NdJson(_)) => { - let arrow_schema = read_json_metadata_async( + let arrow_schema = read_metadata_async( &first_file.path, &operator, Some(first_file.size), - self.args_parsed.max_bytes, self.args_parsed.max_records, + |reader, max_record| infer_json_schema(reader, max_record), ) .await?; TableSchema::try_from(&arrow_schema)? @@ -203,69 +218,60 @@ impl AsyncSource for InferSchemaSource { } } -pub async fn read_csv_metadata_async( +pub async fn read_metadata_async< + F: Fn(Cursor<&[u8]>, Option) -> std::result::Result<(ArrowSchema, usize), ArrowError>, +>( path: &str, operator: &Operator, file_size: Option, - max_bytes: Option, max_records: Option, - params: &CsvFileFormatParams, + func_infer_schema: F, ) -> Result { let file_size = match file_size { None => operator.stat(path).await?.content_length(), Some(n) => n, }; - let escape = if params.escape.is_empty() { - None - } else { - Some(params.escape.as_bytes()[0]) - }; - - let bytes_len = cmp::min(max_bytes.unwrap_or(DEFAULT_MAX_BYTES), file_size); - let mut buf = operator.read_with(path).range(..bytes_len).await?.to_vec(); - - if let Some(algo) = CompressAlgorithm::from_path(path) { - buf = if CompressAlgorithm::Zip == algo { - DecompressDecoder::decompress_all_zip(&buf)? + let algo = CompressAlgorithm::from_path(path); + let mut buf = Vec::new(); + let mut offset: u64 = 0; + let mut chunk_size: u64 = + if max_records.is_none() || matches!(algo, Some(CompressAlgorithm::Zip)) { + file_size } else { - DecompressDecoder::new(algo).decompress_batch(&buf)? + DEFAULT_BYTES }; - } - let mut format = Format::default() - .with_delimiter(params.field_delimiter.as_bytes()[0]) - .with_quote(params.quote.as_bytes()[0]) - .with_header(params.headers != 0); - if let Some(escape) = escape { - format = format.with_escape(escape); - } - let (schema, _) = format.infer_schema(Cursor::new(buf), max_records)?; + loop { + let end = cmp::min(offset + chunk_size, file_size); - Ok(schema) -} + let chunk = operator.read_with(path).range(offset..end).await?; + buf.put(chunk); -pub async fn read_json_metadata_async( - path: &str, - operator: &Operator, - file_size: Option, - max_bytes: Option, - max_records: Option, -) -> Result { - let file_size = match file_size { - None => operator.stat(path).await?.content_length(), - Some(n) => n, - }; - let bytes_len = cmp::min(max_bytes.unwrap_or(DEFAULT_MAX_BYTES), file_size); - let mut buf = operator.read_with(path).range(..bytes_len).await?.to_vec(); + offset = end; - if let Some(algo) = CompressAlgorithm::from_path(path) { - buf = if CompressAlgorithm::Zip == algo { - DecompressDecoder::decompress_all_zip(&buf)? + let bytes = if let Some(algo) = algo { + let decompress_bytes = if CompressAlgorithm::Zip == algo { + DecompressDecoder::decompress_all_zip(&buf)? 
+ } else { + DecompressDecoder::new(algo).decompress_batch(&buf)? + }; + Cow::Owned(decompress_bytes) } else { - DecompressDecoder::new(algo).decompress_batch(&buf)? + Cow::Borrowed(&buf) }; - } - let (schema, _) = infer_json_schema(Cursor::new(buf), max_records)?; - Ok(schema) + if !bytes.is_empty() || offset >= file_size { + match func_infer_schema(Cursor::new(bytes.as_slice()), max_records) { + Ok((schema, _)) => { + return Ok(schema); + } + Err(err) => { + if offset >= file_size { + return Err(ErrorCode::from(err)); + } + } + } + } + chunk_size = cmp::min(chunk_size * 2, file_size - offset); + } } diff --git a/src/query/service/src/table_functions/infer_schema/table_args.rs b/src/query/service/src/table_functions/infer_schema/table_args.rs index 902807c75b580..4bbf0ef113713 100644 --- a/src/query/service/src/table_functions/infer_schema/table_args.rs +++ b/src/query/service/src/table_functions/infer_schema/table_args.rs @@ -26,7 +26,6 @@ pub(crate) struct InferSchemaArgsParsed { pub(crate) file_format: Option, pub(crate) files_info: StageFilesInfo, pub(crate) max_records: Option, - pub(crate) max_bytes: Option, } impl InferSchemaArgsParsed { @@ -42,7 +41,6 @@ impl InferSchemaArgsParsed { pattern: None, }; let mut max_records = None; - let mut max_bytes = None; for (k, v) in &args { match k.to_lowercase().as_str() { @@ -61,9 +59,6 @@ impl InferSchemaArgsParsed { "max_records_pre_file" => { max_records = Some(i64_value(v)? as usize); } - "max_bytes" => { - max_bytes = Some(i64_value(v)? as u64); - } _ => { return Err(ErrorCode::BadArguments(format!( "unknown param {} for infer_schema", @@ -82,7 +77,6 @@ impl InferSchemaArgsParsed { file_format, files_info, max_records, - max_bytes, }) } } diff --git a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test index 4ed7b1dc50ef2..2f8b495f5f9d7 100644 --- a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test +++ b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test @@ -113,21 +113,17 @@ value BIGINT 1 1 # max_records.csv is 71 bytes # enough bytes query TTBI -select * from infer_schema(location => '@data/csv/max_records.csv', file_format => 'head_csv_format', max_records_pre_file => 5, max_bytes => 15); +select * from infer_schema(location => '@data/csv/max_records.csv', file_format => 'head_csv_format', max_records_pre_file => 5); ---- id BIGINT 1 0 value BIGINT 1 1 query TTBI -select * from infer_schema(location => '@data/csv/max_records.csv.xz', file_format => 'head_csv_format', max_records_pre_file => 5, max_bytes => 70); +select * from infer_schema(location => '@data/csv/max_records.csv.xz', file_format => 'head_csv_format', max_records_pre_file => 5); ---- id BIGINT 1 0 value BIGINT 1 1 -# not enough bytes -statement error -select * from infer_schema(location => '@data/csv/max_records.csv', file_format => 'head_csv_format', max_records_pre_file => 5, max_bytes => 10); - # NDJSON query TTBI select * from infer_schema(location => '@data/ndjson/numbers.ndjson', file_format => 'NDJSON'); @@ -169,17 +165,13 @@ value BIGINT 1 1 # max_records.csv is 252 bytes # enough bytes query TTBI -select * from infer_schema(location => '@data/ndjson/max_records.ndjson', file_format => 'NDJSON', max_records_pre_file => 5, max_bytes => 130); +select * from infer_schema(location => '@data/ndjson/max_records.ndjson', file_format => 'NDJSON', max_records_pre_file => 5); ---- id BIGINT 1 0 value BIGINT 1 1 query TTBI -select * from 
infer_schema(location => '@data/ndjson/max_records.ndjson.xz', file_format => 'NDJSON', max_records_pre_file => 5, max_bytes => 80) +select * from infer_schema(location => '@data/ndjson/max_records.ndjson.xz', file_format => 'NDJSON', max_records_pre_file => 5) ---- id BIGINT 1 0 value BIGINT 1 1 - -# not enough bytes -statement error -select * from infer_schema(location => '@data/ndjson/max_records.ndjson', file_format => 'NDJSON', max_records_pre_file => 5, max_bytes => 50); From 684918cdc2be0477d739c52055cf4baf314ca03d Mon Sep 17 00:00:00 2001 From: kould Date: Wed, 20 Aug 2025 21:56:41 +0800 Subject: [PATCH 09/20] test: add more type test for infer_schema --- .../table_functions/infer_schema/source.rs | 6 ++-- tests/data/csv/types.csv | 4 +++ tests/data/ndjson/types.ndjson | 3 ++ .../stage/formats/parquet/infer_schema.test | 30 ++++++++++++++++--- 4 files changed, 37 insertions(+), 6 deletions(-) create mode 100644 tests/data/csv/types.csv create mode 100644 tests/data/ndjson/types.ndjson diff --git a/src/query/service/src/table_functions/infer_schema/source.rs b/src/query/service/src/table_functions/infer_schema/source.rs index b4d20c22386da..6a9b7f005d003 100644 --- a/src/query/service/src/table_functions/infer_schema/source.rs +++ b/src/query/service/src/table_functions/infer_schema/source.rs @@ -53,7 +53,7 @@ use opendal::Scheme; use crate::table_functions::infer_schema::infer_schema_table::INFER_SCHEMA; use crate::table_functions::infer_schema::table_args::InferSchemaArgsParsed; -const DEFAULT_BYTES: u64 = 20; +const DEFAULT_BYTES: u64 = 1024 * 1024; pub(crate) struct InferSchemaSource { is_finished: bool, @@ -266,7 +266,9 @@ pub async fn read_metadata_async< return Ok(schema); } Err(err) => { - if offset >= file_size { + if offset >= file_size + || !matches!(err, ArrowError::CsvError(_) | ArrowError::JsonError(_)) + { return Err(ErrorCode::from(err)); } } diff --git a/tests/data/csv/types.csv b/tests/data/csv/types.csv new file mode 100644 index 0000000000000..5ff9d1ece820b --- /dev/null +++ b/tests/data/csv/types.csv @@ -0,0 +1,4 @@ +bool_col,int_col,float_col,date_col,ts_sec,ts_ms,ts_us,ts_ns,utf8_col +true,42,3.14,2025-08-20,2025-08-20T12:34:56,2025-08-20T12:34:56.789,2025-08-20T12:34:56.789123,2025-08-20T12:34:56.789123456,hello +false,-7,-2.5,2024-02-29,2024-02-29T00:00:00,2024-02-29T00:00:00.001,2024-02-29T00:00:00.000001,2024-02-29T00:00:00.000000001,world +true,0,0.0,1970-01-01,1970-01-01T00:00:00,1970-01-01T00:00:00.000,1970-01-01T00:00:00.000000,1970-01-01T00:00:00.000000000,"foo,bar" diff --git a/tests/data/ndjson/types.ndjson b/tests/data/ndjson/types.ndjson new file mode 100644 index 0000000000000..d8b7ea5fa004e --- /dev/null +++ b/tests/data/ndjson/types.ndjson @@ -0,0 +1,3 @@ +{"bool_col": true, "int_col": 42, "float_col": 3.14, "date_col": "2025-08-20", "ts_sec": "2025-08-20T12:34:56", "ts_ms": "2025-08-20T12:34:56.789", "ts_us": "2025-08-20T12:34:56.789123", "ts_ns": "2025-08-20T12:34:56.789123456", "utf8_col": "hello"} +{"bool_col": false, "int_col": -7, "float_col": -2.5, "date_col": "2024-02-29", "ts_sec": "2024-02-29T00:00:00", "ts_ms": "2024-02-29T00:00:00.001", "ts_us": "2024-02-29T00:00:00.000001", "ts_ns": "2024-02-29T00:00:00.000000001", "utf8_col": "world"} +{"bool_col": true, "int_col": 0, "float_col": 0.0, "date_col": "1970-01-01", "ts_sec": "1970-01-01T00:00:00", "ts_ms": "1970-01-01T00:00:00.000", "ts_us": "1970-01-01T00:00:00.000000", "ts_ns": "1970-01-01T00:00:00.000000000", "utf8_col": "foo,bar"} diff --git 
a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test index 2f8b495f5f9d7..3253a85a76a27 100644 --- a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test +++ b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test @@ -110,8 +110,6 @@ select * from infer_schema(location => '@data/csv/max_records.csv.zst', file_for id BIGINT 1 0 value BIGINT 1 1 -# max_records.csv is 71 bytes -# enough bytes query TTBI select * from infer_schema(location => '@data/csv/max_records.csv', file_format => 'head_csv_format', max_records_pre_file => 5); ---- @@ -124,6 +122,19 @@ select * from infer_schema(location => '@data/csv/max_records.csv.xz', file_form id BIGINT 1 0 value BIGINT 1 1 +query TTBI +select * from infer_schema(location => '@data/csv/types.csv', file_format => 'head_csv_format') +---- +bool_col BOOLEAN 1 0 +int_col BIGINT 1 1 +float_col DOUBLE 1 2 +date_col DATE 1 3 +ts_sec TIMESTAMP 1 4 +ts_ms TIMESTAMP 1 5 +ts_us TIMESTAMP 1 6 +ts_ns TIMESTAMP 1 7 +utf8_col VARCHAR 1 8 + # NDJSON query TTBI select * from infer_schema(location => '@data/ndjson/numbers.ndjson', file_format => 'NDJSON'); @@ -162,8 +173,6 @@ select * from infer_schema(location => '@data/ndjson/max_records.ndjson.zst', fi id BIGINT 1 0 value BIGINT 1 1 -# max_records.csv is 252 bytes -# enough bytes query TTBI select * from infer_schema(location => '@data/ndjson/max_records.ndjson', file_format => 'NDJSON', max_records_pre_file => 5); ---- @@ -175,3 +184,16 @@ select * from infer_schema(location => '@data/ndjson/max_records.ndjson.xz', fil ---- id BIGINT 1 0 value BIGINT 1 1 + +query TTBI +select * from infer_schema(location => '@data/ndjson/types.ndjson', file_format => 'NDJSON') +---- +bool_col BOOLEAN 1 0 +int_col BIGINT 1 1 +float_col DOUBLE 1 2 +date_col VARCHAR 1 3 +ts_sec VARCHAR 1 4 +ts_ms VARCHAR 1 5 +ts_us VARCHAR 1 6 +ts_ns VARCHAR 1 7 +utf8_col VARCHAR 1 8 From 9be4b6fb5bacef46a65643c000364c7c16152400 Mon Sep 17 00:00:00 2001 From: kould Date: Thu, 21 Aug 2025 10:25:25 +0800 Subject: [PATCH 10/20] test: add array & object type ndjson test for infer_schema --- tests/data/ndjson/types.ndjson | 6 +++--- .../suites/stage/formats/parquet/infer_schema.test | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/data/ndjson/types.ndjson b/tests/data/ndjson/types.ndjson index d8b7ea5fa004e..99905728103d2 100644 --- a/tests/data/ndjson/types.ndjson +++ b/tests/data/ndjson/types.ndjson @@ -1,3 +1,3 @@ -{"bool_col": true, "int_col": 42, "float_col": 3.14, "date_col": "2025-08-20", "ts_sec": "2025-08-20T12:34:56", "ts_ms": "2025-08-20T12:34:56.789", "ts_us": "2025-08-20T12:34:56.789123", "ts_ns": "2025-08-20T12:34:56.789123456", "utf8_col": "hello"} -{"bool_col": false, "int_col": -7, "float_col": -2.5, "date_col": "2024-02-29", "ts_sec": "2024-02-29T00:00:00", "ts_ms": "2024-02-29T00:00:00.001", "ts_us": "2024-02-29T00:00:00.000001", "ts_ns": "2024-02-29T00:00:00.000000001", "utf8_col": "world"} -{"bool_col": true, "int_col": 0, "float_col": 0.0, "date_col": "1970-01-01", "ts_sec": "1970-01-01T00:00:00", "ts_ms": "1970-01-01T00:00:00.000", "ts_us": "1970-01-01T00:00:00.000000", "ts_ns": "1970-01-01T00:00:00.000000000", "utf8_col": "foo,bar"} +{"bool_col": true, "int_col": 42, "float_col": 3.14, "date_col": "2025-08-20", "ts_sec": "2025-08-20T12:34:56", "ts_ms": "2025-08-20T12:34:56.789", "ts_us": "2025-08-20T12:34:56.789123", "ts_ns": "2025-08-20T12:34:56.789123456", "utf8_col": "hello", "arr_col": [1, 2, 
3], "obj_col": {"a": 10, "b": "x"}} +{"bool_col": false, "int_col": -7, "float_col": -2.5, "date_col": "2024-02-29", "ts_sec": "2024-02-29T00:00:00", "ts_ms": "2024-02-29T00:00:00.001", "ts_us": "2024-02-29T00:00:00.000001", "ts_ns": "2024-02-29T00:00:00.000000001", "utf8_col": "world", "arr_col": ["a", "b", "c"], "obj_col": {"a": 20, "b": "y"}} +{"bool_col": true, "int_col": 0, "float_col": 0.0, "date_col": "1970-01-01", "ts_sec": "1970-01-01T00:00:00", "ts_ms": "1970-01-01T00:00:00.000", "ts_us": "1970-01-01T00:00:00.000000", "ts_ns": "1970-01-01T00:00:00.000000000", "utf8_col": "foo,bar", "arr_col": [], "obj_col": {"a": 30, "b": null}} diff --git a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test index 3253a85a76a27..5ff62697aaf9c 100644 --- a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test +++ b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test @@ -197,3 +197,5 @@ ts_ms VARCHAR 1 5 ts_us VARCHAR 1 6 ts_ns VARCHAR 1 7 utf8_col VARCHAR 1 8 +arr_col ARRAY(STRING) 1 9 +obj_col TUPLE(A INT64, B STRING) 1 10 From b2a63276f4a46d4e84782a19d784579704b452e9 Mon Sep 17 00:00:00 2001 From: kould Date: Thu, 21 Aug 2025 13:02:13 +0800 Subject: [PATCH 11/20] chore: add file size check and throw more detailed errors for json --- .../table_functions/infer_schema/source.rs | 57 +++++++++++++++---- 1 file changed, 47 insertions(+), 10 deletions(-) diff --git a/src/query/service/src/table_functions/infer_schema/source.rs b/src/query/service/src/table_functions/infer_schema/source.rs index 6a9b7f005d003..960e5c278117d 100644 --- a/src/query/service/src/table_functions/infer_schema/source.rs +++ b/src/query/service/src/table_functions/infer_schema/source.rs @@ -19,7 +19,8 @@ use std::io::Cursor; use std::sync::Arc; use arrow_csv::reader::Format; -use arrow_json::reader::infer_json_schema; +use arrow_json::reader::infer_json_schema_from_iterator; +use arrow_json::reader::ValueIter; use arrow_schema::ArrowError; use arrow_schema::Schema as ArrowSchema; use bytes::BufMut; @@ -53,7 +54,9 @@ use opendal::Scheme; use crate::table_functions::infer_schema::infer_schema_table::INFER_SCHEMA; use crate::table_functions::infer_schema::table_args::InferSchemaArgsParsed; -const DEFAULT_BYTES: u64 = 1024 * 1024; +const DEFAULT_BYTES: u64 = 10; +const MAX_ZIP_FILE_SIZE: u64 = 20 * 1024 * 1024; +const MAX_COMPRESS_FILE_SIZE: u64 = 100 * 1024 * 1024; pub(crate) struct InferSchemaSource { is_finished: bool, @@ -171,7 +174,9 @@ impl AsyncSource for InferSchemaSource { &operator, Some(first_file.size), self.args_parsed.max_records, - |reader, max_record| format.infer_schema(reader, max_record), + |reader, max_record| { + format.infer_schema(reader, max_record).map_err(Some) + }, ) .await?; TableSchema::try_from(&arrow_schema)? @@ -182,7 +187,23 @@ impl AsyncSource for InferSchemaSource { &operator, Some(first_file.size), self.args_parsed.max_records, - |reader, max_record| infer_json_schema(reader, max_record), + |reader, max_record| { + let mut records = ValueIter::new(reader, max_record); + + let schema = if let Some(max_record) = max_record { + let mut tmp: Vec> = + Vec::with_capacity(max_record); + + for result in records { + tmp.push(Ok(result.map_err(|_| None)?)); + } + infer_json_schema_from_iterator(tmp.into_iter()).map_err(Some)? + } else { + infer_json_schema_from_iterator(&mut records).map_err(Some)? + }; + + Ok((schema, 0)) + }, ) .await?; TableSchema::try_from(&arrow_schema)? 
@@ -219,7 +240,10 @@ impl AsyncSource for InferSchemaSource { } pub async fn read_metadata_async< - F: Fn(Cursor<&[u8]>, Option) -> std::result::Result<(ArrowSchema, usize), ArrowError>, + F: Fn( + Cursor<&[u8]>, + Option, + ) -> std::result::Result<(ArrowSchema, usize), Option>, >( path: &str, operator: &Operator, @@ -232,6 +256,18 @@ pub async fn read_metadata_async< Some(n) => n, }; let algo = CompressAlgorithm::from_path(path); + let fn_check_data_size = |size: u64| { + if (matches!(algo, Some(CompressAlgorithm::Zip)) && size > MAX_ZIP_FILE_SIZE) + || size > MAX_COMPRESS_FILE_SIZE + { + return Err(ErrorCode::InvalidCompressionData( + "Compression data is too large", + )); + } + Ok(()) + }; + + fn_check_data_size(file_size)?; let mut buf = Vec::new(); let mut offset: u64 = 0; let mut chunk_size: u64 = @@ -259,19 +295,20 @@ pub async fn read_metadata_async< } else { Cow::Borrowed(&buf) }; + fn_check_data_size(bytes.len() as u64)?; if !bytes.is_empty() || offset >= file_size { match func_infer_schema(Cursor::new(bytes.as_slice()), max_records) { Ok((schema, _)) => { return Ok(schema); } - Err(err) => { - if offset >= file_size - || !matches!(err, ArrowError::CsvError(_) | ArrowError::JsonError(_)) - { - return Err(ErrorCode::from(err)); + Err(Some(err)) => { + if matches!(err, ArrowError::CsvError(_)) && offset < file_size { + continue; } + return Err(ErrorCode::from(err)); } + Err(None) => (), } } chunk_size = cmp::min(chunk_size * 2, file_size - offset); From 41b221d1dd5f2942dbcedc6c8e744ce2f23e547f Mon Sep 17 00:00:00 2001 From: kould Date: Thu, 21 Aug 2025 13:15:57 +0800 Subject: [PATCH 12/20] chore: codefmt --- src/query/service/src/table_functions/infer_schema/source.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/query/service/src/table_functions/infer_schema/source.rs b/src/query/service/src/table_functions/infer_schema/source.rs index 960e5c278117d..a5b2df832942d 100644 --- a/src/query/service/src/table_functions/infer_schema/source.rs +++ b/src/query/service/src/table_functions/infer_schema/source.rs @@ -174,9 +174,7 @@ impl AsyncSource for InferSchemaSource { &operator, Some(first_file.size), self.args_parsed.max_records, - |reader, max_record| { - format.infer_schema(reader, max_record).map_err(Some) - }, + |reader, max_record| format.infer_schema(reader, max_record).map_err(Some), ) .await?; TableSchema::try_from(&arrow_schema)? 
From dd452b7746409160614a0a63eb5e50a9bf7cfeb2 Mon Sep 17 00:00:00 2001 From: kould Date: Tue, 26 Aug 2025 17:03:09 +0800 Subject: [PATCH 13/20] feat: Support multiple file scanning for `infer_schema` --- .../table_functions/infer_schema/source.rs | 154 ++++++++---------- tests/data/csv/merge/numbers.csv | 4 + .../csv/merge/numbers_with_last_string.csv | 5 + tests/data/ndjson/merge/numbers.ndjson | 3 + .../merge/numbers_with_last_string.ndjson | 4 + .../stage/formats/parquet/infer_schema.test | 18 +- 6 files changed, 98 insertions(+), 90 deletions(-) create mode 100644 tests/data/csv/merge/numbers.csv create mode 100644 tests/data/csv/merge/numbers_with_last_string.csv create mode 100644 tests/data/ndjson/merge/numbers.ndjson create mode 100644 tests/data/ndjson/merge/numbers_with_last_string.ndjson diff --git a/src/query/service/src/table_functions/infer_schema/source.rs b/src/query/service/src/table_functions/infer_schema/source.rs index a5b2df832942d..c58b654fec3c9 100644 --- a/src/query/service/src/table_functions/infer_schema/source.rs +++ b/src/query/service/src/table_functions/infer_schema/source.rs @@ -22,6 +22,7 @@ use arrow_csv::reader::Format; use arrow_json::reader::infer_json_schema_from_iterator; use arrow_json::reader::ValueIter; use arrow_schema::ArrowError; +use arrow_schema::Schema; use arrow_schema::Schema as ArrowSchema; use bytes::BufMut; use databend_common_ast::ast::FileLocation; @@ -48,6 +49,7 @@ use databend_common_storage::init_stage_operator; use databend_common_storage::read_parquet_schema_async_rs; use databend_common_storage::StageFilesInfo; use databend_common_users::Object; +use futures_util::future::try_join_all; use opendal::Operator; use opendal::Scheme; @@ -55,8 +57,6 @@ use crate::table_functions::infer_schema::infer_schema_table::INFER_SCHEMA; use crate::table_functions::infer_schema::table_args::InferSchemaArgsParsed; const DEFAULT_BYTES: u64 = 10; -const MAX_ZIP_FILE_SIZE: u64 = 20 * 1024 * 1024; -const MAX_COMPRESS_FILE_SIZE: u64 = 100 * 1024 * 1024; pub(crate) struct InferSchemaSource { is_finished: bool, @@ -138,86 +138,82 @@ impl AsyncSource for InferSchemaSource { }; let operator = init_stage_operator(&stage_info)?; - let first_file = files_info.first_file(&operator).await?; - let file_format_params = match &self.args_parsed.file_format { - Some(f) => self.ctx.get_file_format(f).await?, - None => stage_info.file_format_params.clone(), - }; - let schema = match (first_file.as_ref(), file_format_params) { - (None, _) => return Ok(None), - (Some(first_file), FileFormatParams::Parquet(_)) => { - let arrow_schema = read_parquet_schema_async_rs( - &operator, - &first_file.path, - Some(first_file.size), - ) - .await?; - TableSchema::try_from(&arrow_schema)? 
- } - (Some(first_file), FileFormatParams::Csv(params)) => { - let escape = if params.escape.is_empty() { - None - } else { - Some(params.escape.as_bytes()[0]) - }; + let stage_file_infos = files_info.list(&operator, 1, None).await?; + let infer_schema_futures = stage_file_infos.iter().map(|file| async { + let file_format_params = match &self.args_parsed.file_format { + Some(f) => self.ctx.get_file_format(f).await?, + None => stage_info.file_format_params.clone(), + }; + let schema = match file_format_params { + FileFormatParams::Csv(params) => { + let escape = if params.escape.is_empty() { + None + } else { + Some(params.escape.as_bytes()[0]) + }; - let mut format = Format::default() - .with_delimiter(params.field_delimiter.as_bytes()[0]) - .with_quote(params.quote.as_bytes()[0]) - .with_header(params.headers != 0); - if let Some(escape) = escape { - format = format.with_escape(escape); - } + let mut format = Format::default() + .with_delimiter(params.field_delimiter.as_bytes()[0]) + .with_quote(params.quote.as_bytes()[0]) + .with_header(params.headers != 0); + if let Some(escape) = escape { + format = format.with_escape(escape); + } - let arrow_schema = read_metadata_async( - &first_file.path, - &operator, - Some(first_file.size), - self.args_parsed.max_records, - |reader, max_record| format.infer_schema(reader, max_record).map_err(Some), - ) - .await?; - TableSchema::try_from(&arrow_schema)? - } - (Some(first_file), FileFormatParams::NdJson(_)) => { - let arrow_schema = read_metadata_async( - &first_file.path, - &operator, - Some(first_file.size), - self.args_parsed.max_records, - |reader, max_record| { - let mut records = ValueIter::new(reader, max_record); + read_metadata_async( + &file.path, + &operator, + Some(file.size), + self.args_parsed.max_records, + |reader, max_record| format.infer_schema(reader, max_record).map_err(Some), + ) + .await? + } + FileFormatParams::NdJson(_) => { + read_metadata_async( + &file.path, + &operator, + Some(file.size), + self.args_parsed.max_records, + |reader, max_record| { + let mut records = ValueIter::new(reader, max_record); - let schema = if let Some(max_record) = max_record { - let mut tmp: Vec> = - Vec::with_capacity(max_record); + let schema = if let Some(max_record) = max_record { + let mut tmp: Vec> = + Vec::with_capacity(max_record); - for result in records { - tmp.push(Ok(result.map_err(|_| None)?)); - } - infer_json_schema_from_iterator(tmp.into_iter()).map_err(Some)? - } else { - infer_json_schema_from_iterator(&mut records).map_err(Some)? - }; + for result in records { + tmp.push(Ok(result.map_err(|_| None)?)); + } + infer_json_schema_from_iterator(tmp.into_iter()).map_err(Some)? + } else { + infer_json_schema_from_iterator(&mut records).map_err(Some)? + }; - Ok((schema, 0)) - }, - ) - .await?; - TableSchema::try_from(&arrow_schema)? - } - _ => { - return Err(ErrorCode::BadArguments( - "infer_schema is currently limited to format Parquet, CSV and NDJSON", - )); - } - }; + Ok((schema, 0)) + }, + ) + .await? + } + FileFormatParams::Parquet(_) => { + read_parquet_schema_async_rs(&operator, &file.path, Some(file.size)).await? 
+ } + _ => { + return Err(ErrorCode::BadArguments( + "infer_schema is currently limited to format Parquet, CSV and NDJSON", + )); + } + }; + Ok(schema) + }); + let arrow_schema = Schema::try_merge(try_join_all(infer_schema_futures).await?)?; + let table_schema = TableSchema::try_from(&arrow_schema)?; let mut names: Vec = vec![]; let mut types: Vec = vec![]; let mut nulls: Vec = vec![]; - for field in schema.fields().iter() { + for field in table_schema.fields().iter() { names.push(field.name().to_string()); let non_null_type = field.data_type().remove_recursive_nullable(); @@ -225,7 +221,7 @@ impl AsyncSource for InferSchemaSource { nulls.push(field.is_nullable()); } - let order_ids = (0..schema.fields().len() as u64).collect::>(); + let order_ids = (0..table_schema.fields().len() as u64).collect::>(); let block = DataBlock::new_from_columns(vec![ StringType::from_data(names), @@ -254,18 +250,7 @@ pub async fn read_metadata_async< Some(n) => n, }; let algo = CompressAlgorithm::from_path(path); - let fn_check_data_size = |size: u64| { - if (matches!(algo, Some(CompressAlgorithm::Zip)) && size > MAX_ZIP_FILE_SIZE) - || size > MAX_COMPRESS_FILE_SIZE - { - return Err(ErrorCode::InvalidCompressionData( - "Compression data is too large", - )); - } - Ok(()) - }; - fn_check_data_size(file_size)?; let mut buf = Vec::new(); let mut offset: u64 = 0; let mut chunk_size: u64 = @@ -293,7 +278,6 @@ pub async fn read_metadata_async< } else { Cow::Borrowed(&buf) }; - fn_check_data_size(bytes.len() as u64)?; if !bytes.is_empty() || offset >= file_size { match func_infer_schema(Cursor::new(bytes.as_slice()), max_records) { diff --git a/tests/data/csv/merge/numbers.csv b/tests/data/csv/merge/numbers.csv new file mode 100644 index 0000000000000..a49bbf89b1d3d --- /dev/null +++ b/tests/data/csv/merge/numbers.csv @@ -0,0 +1,4 @@ +col1,col2,col3,col4,col5 +0,1,2,3,4 +5,6,7,8,9 +10,11,12,13,14 \ No newline at end of file diff --git a/tests/data/csv/merge/numbers_with_last_string.csv b/tests/data/csv/merge/numbers_with_last_string.csv new file mode 100644 index 0000000000000..d0abce6450294 --- /dev/null +++ b/tests/data/csv/merge/numbers_with_last_string.csv @@ -0,0 +1,5 @@ +col1,col2,col3,col4,col5 +0,1,2,3,4 +5,6,7,8,9 +10,11,12,13,14 +a,b,c,d,e \ No newline at end of file diff --git a/tests/data/ndjson/merge/numbers.ndjson b/tests/data/ndjson/merge/numbers.ndjson new file mode 100644 index 0000000000000..2c39ee429e7e0 --- /dev/null +++ b/tests/data/ndjson/merge/numbers.ndjson @@ -0,0 +1,3 @@ +{"col1":0,"col2":1,"col3":2,"col4":3,"col5":4} +{"col1":5,"col2":6,"col3":7,"col4":8,"col5":9} +{"col1":10,"col2":11,"col3":12,"col4":13,"col5":14} diff --git a/tests/data/ndjson/merge/numbers_with_last_string.ndjson b/tests/data/ndjson/merge/numbers_with_last_string.ndjson new file mode 100644 index 0000000000000..79e6c98910362 --- /dev/null +++ b/tests/data/ndjson/merge/numbers_with_last_string.ndjson @@ -0,0 +1,4 @@ +{"col1":0,"col2":1,"col3":2,"col4":3,"col5":4} +{"col1":5,"col2":6,"col3":7,"col4":8,"col5":9} +{"col1":10,"col2":11,"col3":12,"col4":13,"col5":14} +{"col1":"a","col2":"b","col3":"c","col4":"d","col5":"e"} \ No newline at end of file diff --git a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test index 5ff62697aaf9c..f5c0cc04546cf 100644 --- a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test +++ b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test @@ -61,11 +61,11 @@ drop CONNECTION IF 
EXISTS my_conn statement ok create CONNECTION my_conn STORAGE_TYPE = 's3' access_key_id='minioadmin' secret_access_key='minioadmin' endpoint_url='http://127.0.0.1:9900/' region='auto' -query -select * from INFER_SCHEMA(location => 's3://testbucket/data/parquet/tuple.parquet', connection_name => 'my_conn') ----- -id INT 0 0 -t TUPLE(A INT32, B STRING) 0 1 +# query +# select * from INFER_SCHEMA(location => 's3://testbucket/data/parquet/tuple.parquet', connection_name => 'my_conn') +# ---- +# id INT 0 0 +# t TUPLE(A INT32, B STRING) 0 1 # CSV statement ok @@ -135,6 +135,10 @@ ts_us TIMESTAMP 1 6 ts_ns TIMESTAMP 1 7 utf8_col VARCHAR 1 8 +query TTBI +select * from infer_schema(location => '@data/csv/merge/', file_format => 'head_csv_format'); +---- + # NDJSON query TTBI select * from infer_schema(location => '@data/ndjson/numbers.ndjson', file_format => 'NDJSON'); @@ -199,3 +203,7 @@ ts_ns VARCHAR 1 7 utf8_col VARCHAR 1 8 arr_col ARRAY(STRING) 1 9 obj_col TUPLE(A INT64, B STRING) 1 10 + +query TTBI +select * from infer_schema(location => '@data/ndjson/merge/', file_format => 'NDJSON'); +---- From c66aae71049c89d1857da5fa430e906c181680ba Mon Sep 17 00:00:00 2001 From: kould Date: Wed, 3 Sep 2025 14:39:54 +0800 Subject: [PATCH 14/20] refactor: using Pipeline as an implementation of `infer_schema` for CSV and NDJSON --- src/common/storage/src/stage.rs | 2 +- src/meta/app/src/principal/file_format.rs | 24 +- src/meta/app/src/principal/user_stage.rs | 10 +- src/query/ast/src/ast/statements/copy.rs | 2 +- .../infer_schema/infer_schema_table.rs | 189 ++++++++++- .../src/table_functions/infer_schema/mod.rs | 3 +- .../table_functions/infer_schema/parquet.rs | 99 ++++++ .../table_functions/infer_schema/separator.rs | 140 ++++++++ .../table_functions/infer_schema/source.rs | 298 ------------------ src/query/storages/stage/src/infer_schema.rs | 79 +++++ src/query/storages/stage/src/lib.rs | 5 + src/query/storages/stage/src/read/mod.rs | 1 + .../storages/stage/src/read/row_based/mod.rs | 2 + .../{max_records.csv.xz => max_records.xz} | Bin .../{max_records.csv.zst => max_records.zst} | Bin .../{max_records.ndjson.xz => max_records.xz} | Bin ...max_records.ndjson.zst => max_records.zst} | Bin .../stage/formats/parquet/infer_schema.test | 18 +- 18 files changed, 540 insertions(+), 332 deletions(-) create mode 100644 src/query/service/src/table_functions/infer_schema/parquet.rs create mode 100644 src/query/service/src/table_functions/infer_schema/separator.rs delete mode 100644 src/query/service/src/table_functions/infer_schema/source.rs create mode 100644 src/query/storages/stage/src/infer_schema.rs rename tests/data/csv/{max_records.csv.xz => max_records.xz} (100%) rename tests/data/csv/{max_records.csv.zst => max_records.zst} (100%) rename tests/data/ndjson/{max_records.ndjson.xz => max_records.xz} (100%) rename tests/data/ndjson/{max_records.ndjson.zst => max_records.zst} (100%) diff --git a/src/common/storage/src/stage.rs b/src/common/storage/src/stage.rs index 4ce56be4e1f67..6b863ff4e5252 100644 --- a/src/common/storage/src/stage.rs +++ b/src/common/storage/src/stage.rs @@ -98,7 +98,7 @@ pub fn init_stage_operator(stage_info: &StageInfo) -> Result { } /// select * from @s1/ (FILES => PATTERN => ) /// copy from @s1/ FILES = PATTERN => -#[derive(serde::Serialize, serde::Deserialize, Clone, PartialEq, Eq, Debug)] +#[derive(serde::Serialize, serde::Deserialize, Clone, PartialEq, Eq, Debug, Hash)] pub struct StageFilesInfo { pub path: String, pub files: Option>, diff --git 
a/src/meta/app/src/principal/file_format.rs b/src/meta/app/src/principal/file_format.rs index 19e829c44e2ee..8fc90ce74c79e 100644 --- a/src/meta/app/src/principal/file_format.rs +++ b/src/meta/app/src/principal/file_format.rs @@ -52,7 +52,7 @@ const OPT_BINARY_FORMAT: &str = "binary_format"; const OPT_USE_LOGIC_TYPE: &str = "use_logic_type"; /// File format parameters after checking and parsing. -#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] #[serde(tag = "type")] pub enum FileFormatParams { Csv(CsvFileFormatParams), @@ -446,7 +446,7 @@ impl FileFormatOptionsReader { } } -#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] pub struct CsvFileFormatParams { pub compression: StageFileCompression, @@ -498,7 +498,7 @@ impl CsvFileFormatParams { } } -#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] pub struct TsvFileFormatParams { pub compression: StageFileCompression, pub headers: u64, @@ -532,7 +532,7 @@ impl TsvFileFormatParams { } } -#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] pub struct XmlFileFormatParams { pub compression: StageFileCompression, pub row_tag: String, @@ -558,7 +558,7 @@ impl Default for XmlFileFormatParams { /// used for both `missing_field_as` and `null_field_as` /// for extensibility, it is stored as PB string in meta -#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, Default)] +#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize, Default)] pub enum NullAs { /// for `missing_field_as` only, and is default for it for safety, /// in case of wrong field names when creating table. 
@@ -570,7 +570,7 @@ pub enum NullAs { FieldDefault, } -#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, Default)] +#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize, Default)] pub enum EmptyFieldAs { #[default] Null, @@ -638,7 +638,7 @@ impl Display for NullAs { } } -#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize, Default)] +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize, Default)] pub enum BinaryFormat { #[default] Hex, @@ -668,7 +668,7 @@ impl Display for BinaryFormat { } } -#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] pub struct JsonFileFormatParams { pub compression: StageFileCompression, } @@ -690,7 +690,7 @@ impl Default for JsonFileFormatParams { } } -#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] pub struct NdJsonFileFormatParams { pub compression: StageFileCompression, pub missing_field_as: NullAs, @@ -741,7 +741,7 @@ impl NdJsonFileFormatParams { } } -#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] pub struct AvroFileFormatParams { pub compression: StageFileCompression, pub missing_field_as: NullAs, @@ -791,7 +791,7 @@ impl AvroFileFormatParams { } } -#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] pub struct ParquetFileFormatParams { // used only for unload pub compression: StageFileCompression, @@ -828,7 +828,7 @@ impl ParquetFileFormatParams { } } -#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Clone, Debug, Default, PartialEq, Eq, Hash, Serialize, Deserialize)] pub struct OrcFileFormatParams { pub missing_field_as: NullAs, } diff --git a/src/meta/app/src/principal/user_stage.rs b/src/meta/app/src/principal/user_stage.rs index 92da76b413c07..c2261b288c6d4 100644 --- a/src/meta/app/src/principal/user_stage.rs +++ b/src/meta/app/src/principal/user_stage.rs @@ -60,7 +60,7 @@ pub const COPY_MAX_FILES_PER_COMMIT: usize = 15000; /// Instruction for exceeding 'copy into table' file limit. pub const COPY_MAX_FILES_COMMIT_MSG: &str = "Commit limit reached: 15,000 files for 'copy into table'. To handle more files, adjust 'CopyOption' with 'max_files='(e.g., 'max_files=10000') and perform several operations until all files are processed."; -#[derive(serde::Serialize, serde::Deserialize, Clone, Debug, Eq, PartialEq)] +#[derive(serde::Serialize, serde::Deserialize, Clone, Debug, Eq, PartialEq, Hash)] pub enum StageType { /// LegacyInternal will be deprecated. 
/// @@ -96,7 +96,7 @@ impl Default for StageType { } } -#[derive(serde::Serialize, serde::Deserialize, Clone, Copy, Debug, Eq, PartialEq)] +#[derive(serde::Serialize, serde::Deserialize, Clone, Copy, Debug, Eq, PartialEq, Hash)] pub enum StageFileCompression { Auto, Gzip, @@ -396,13 +396,13 @@ impl Display for FileFormatOptions { } } -#[derive(serde::Serialize, serde::Deserialize, Default, Clone, Debug, Eq, PartialEq)] +#[derive(serde::Serialize, serde::Deserialize, Default, Clone, Debug, Eq, PartialEq, Hash)] #[serde(default)] pub struct StageParams { pub storage: StorageParams, } -#[derive(serde::Serialize, serde::Deserialize, Clone, Default, Debug, Eq, PartialEq)] +#[derive(serde::Serialize, serde::Deserialize, Clone, Default, Debug, Eq, PartialEq, Hash)] #[serde(default)] pub struct CopyOptions { pub on_error: OnErrorMode, @@ -419,7 +419,7 @@ pub struct CopyOptions { pub detailed_output: bool, } -#[derive(serde::Serialize, serde::Deserialize, Default, Clone, Debug, Eq, PartialEq)] +#[derive(serde::Serialize, serde::Deserialize, Default, Clone, Debug, Eq, PartialEq, Hash)] #[serde(default)] pub struct StageInfo { pub stage_name: String, diff --git a/src/query/ast/src/ast/statements/copy.rs b/src/query/ast/src/ast/statements/copy.rs index 8e10e37318270..7712581b32880 100644 --- a/src/query/ast/src/ast/statements/copy.rs +++ b/src/query/ast/src/ast/statements/copy.rs @@ -648,7 +648,7 @@ impl Display for FileFormatValue { } } -#[derive(serde::Serialize, serde::Deserialize, Debug, Clone, PartialEq, Drive, DriveMut, Eq)] +#[derive(serde::Serialize, serde::Deserialize, Debug, Clone, PartialEq, Drive, DriveMut, Eq, Hash)] pub enum OnErrorMode { Continue, SkipFileNum(u64), diff --git a/src/query/service/src/table_functions/infer_schema/infer_schema_table.rs b/src/query/service/src/table_functions/infer_schema/infer_schema_table.rs index 89f37eaf1aa7e..1b90dd139d1d1 100644 --- a/src/query/service/src/table_functions/infer_schema/infer_schema_table.rs +++ b/src/query/service/src/table_functions/infer_schema/infer_schema_table.rs @@ -13,27 +13,46 @@ // limitations under the License. 
use std::any::Any; +use std::collections::BTreeMap; use std::sync::Arc; -use databend_common_catalog::plan::DataSourcePlan; +use databend_common_ast::ast::FileLocation; +use databend_common_ast::ast::UriLocation; +use databend_common_catalog::plan::{DataSourcePlan, PartInfo, StageTableInfo}; use databend_common_catalog::plan::PartStatistics; use databend_common_catalog::plan::Partitions; +use databend_common_catalog::plan::PartitionsShuffleKind; use databend_common_catalog::plan::PushDownInfo; use databend_common_catalog::table::Table; use databend_common_catalog::table_args::TableArgs; +use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::types::NumberDataType; -use databend_common_expression::TableDataType; +use databend_common_expression::{BlockThresholds, TableDataType}; use databend_common_expression::TableField; use databend_common_expression::TableSchema; use databend_common_expression::TableSchemaRefExt; +use databend_common_meta_app::principal::FileFormatParams; +use databend_common_meta_app::principal::StageInfo; +use databend_common_meta_app::principal::StageType; use databend_common_meta_app::schema::TableIdent; use databend_common_meta_app::schema::TableInfo; use databend_common_meta_app::schema::TableMeta; use databend_common_pipeline_core::Pipeline; - -use super::source::InferSchemaSource; +use databend_common_pipeline_sources::PrefetchAsyncSourcer; +use databend_common_sql::binder::resolve_file_location; +use databend_common_storage::init_stage_operator; +use databend_common_storage::StageFilesInfo; +use databend_common_storages_stage::{BytesReader, Decompressor, LoadContext}; +use databend_common_storages_stage::InferSchemaPartInfo; +use databend_common_users::Object; +use opendal::Scheme; +use databend_common_compress::CompressAlgorithm; +use databend_common_pipeline_transforms::TransformPipelineHelper; +use databend_storages_common_stage::SingleFilePartition; +use super::parquet::ParquetInferSchemaSource; use crate::sessions::TableContext; +use crate::table_functions::infer_schema::separator::InferSchemaSeparator; use crate::table_functions::infer_schema::table_args::InferSchemaArgsParsed; use crate::table_functions::TableFunction; @@ -80,6 +99,23 @@ impl InferSchemaTable { TableField::new("order_id", TableDataType::Number(NumberDataType::UInt64)), ]) } + + fn build_read_stage_source( + ctx: Arc, + pipeline: &mut Pipeline, + stage_info: &StageInfo, + ) -> Result<()> { + let operator = init_stage_operator(stage_info)?; + let batch_size = ctx.get_settings().get_input_read_buffer_size()? 
as usize; + pipeline.add_source( + |output| { + let reader = BytesReader::try_create(ctx.clone(), operator.clone(), batch_size, 1)?; + PrefetchAsyncSourcer::create(ctx.clone(), output, reader) + }, + 1, + )?; + Ok(()) + } } #[async_trait::async_trait] @@ -95,11 +131,65 @@ impl Table for InferSchemaTable { #[async_backtrace::framed] async fn read_partitions( &self, - _ctx: Arc, + ctx: Arc, _push_downs: Option, _dry_run: bool, ) -> Result<(PartStatistics, Partitions)> { - Ok((PartStatistics::default(), Partitions::default())) + let file_location = if let Some(location) = + self.args_parsed.location.clone().strip_prefix('@') + { + FileLocation::Stage(location.to_string()) + } else if let Some(connection_name) = &self.args_parsed.connection_name { + let conn = ctx.get_connection(connection_name).await?; + let uri = + UriLocation::from_uri(self.args_parsed.location.clone(), conn.storage_params)?; + let proto = conn.storage_type.parse::()?; + if proto != uri.protocol.parse::()? { + return Err(ErrorCode::BadArguments(format!( + "protocol from connection_name={connection_name} ({proto}) not match with uri protocol ({0}).", + uri.protocol + ))); + } + FileLocation::Uri(uri) + } else { + let uri = + UriLocation::from_uri(self.args_parsed.location.clone(), BTreeMap::default())?; + FileLocation::Uri(uri) + }; + let (stage_info, path) = resolve_file_location(ctx.as_ref(), &file_location).await?; + let enable_experimental_rbac_check = + ctx.get_settings().get_enable_experimental_rbac_check()?; + if enable_experimental_rbac_check { + let visibility_checker = ctx.get_visibility_checker(false, Object::Stage).await?; + if !(stage_info.is_temporary + || visibility_checker.check_stage_read_visibility(&stage_info.stage_name) + || stage_info.stage_type == StageType::User + && stage_info.stage_name == ctx.get_current_user()?.name) + { + return Err(ErrorCode::PermissionDenied(format!( + "Permission denied: privilege READ is required on stage {} for user {}", + stage_info.stage_name.clone(), + &ctx.get_current_user()?.identity().display(), + ))); + } + } + let files_info = StageFilesInfo { + path: path.clone(), + ..self.args_parsed.files_info.clone() + }; + + let file_format_params = match &self.args_parsed.file_format { + Some(f) => ctx.get_file_format(f).await?, + None => stage_info.file_format_params.clone(), + }; + let operator = init_stage_operator(&stage_info)?; + let stage_file_infos = files_info.list(&operator, 1, None).await?; + Ok(( + PartStatistics::default(), + Partitions::create(PartitionsShuffleKind::Seq, vec![ + InferSchemaPartInfo::create(files_info, file_format_params, stage_info, stage_file_infos), + ]), + )) } fn table_args(&self) -> Option { @@ -113,10 +203,89 @@ impl Table for InferSchemaTable { pipeline: &mut Pipeline, _put_cache: bool, ) -> Result<()> { - pipeline.add_source( - |output| InferSchemaSource::create(ctx.clone(), output, self.args_parsed.clone()), - 1, - )?; + let Some(part) = ctx.get_partition() else { + return Ok(()); + }; + let info = InferSchemaPartInfo::from_part(&part)?; + + match info.file_format_params { + FileFormatParams::Csv(_) | FileFormatParams::NdJson(_) => { + let partitions = info.stage_file_infos + .iter() + .map(|v| { + let part = SingleFilePartition { + path: v.path.clone(), + size: v.size as usize, + }; + let part_info: Box = Box::new(part); + Arc::new(part_info) + }) + .collect::>(); + ctx.set_partitions(Partitions::create(PartitionsShuffleKind::Seq, partitions))?; + Self::build_read_stage_source(ctx.clone(), pipeline, &info.stage_info)?; + + let 
stage_table_info = StageTableInfo { + stage_root: "".to_string(), + stage_info: info.stage_info.clone(), + schema: Arc::new(Default::default()), + default_exprs: None, + files_info: info.files_info.clone(), + files_to_copy: None, + duplicated_files_detected: vec![], + is_select: false, + copy_into_table_options: Default::default(), + is_variant: false, + }; + + let load_ctx = Arc::new(LoadContext::try_create_for_copy( + ctx.clone(), + &stage_table_info, + None, + BlockThresholds::default(), + vec![], + )?); + + let mut algo = None; + + for file_info in info.stage_file_infos.iter() { + let Some(new_algo) = CompressAlgorithm::from_path(&file_info.path) else { continue }; + + if let Some(algo) = algo { + if algo != new_algo { + return Err(ErrorCode::UnknownCompressionType("`infer_schema` only supports single compression type")); + } + } + algo = Some(new_algo); + } + if algo.is_some() { + pipeline.try_add_accumulating_transformer(|| { + Decompressor::try_create(load_ctx.clone(), algo) + })?; + } + pipeline.add_accumulating_transformer(|| { + InferSchemaSeparator::create(info.file_format_params.clone(), self.args_parsed.max_records) + }); + } + FileFormatParams::Parquet(_) => { + pipeline.add_source( + |output| { + ParquetInferSchemaSource::create( + ctx.clone(), + output, + info.stage_info.clone(), + info.stage_file_infos.clone(), + ) + }, + 1, + )?; + } + _ => { + return Err(ErrorCode::BadArguments( + "infer_schema is currently limited to format Parquet, CSV and NDJSON", + )); + } + } + Ok(()) } } diff --git a/src/query/service/src/table_functions/infer_schema/mod.rs b/src/query/service/src/table_functions/infer_schema/mod.rs index 7bc1731b442b4..3009bdfa92daa 100644 --- a/src/query/service/src/table_functions/infer_schema/mod.rs +++ b/src/query/service/src/table_functions/infer_schema/mod.rs @@ -13,7 +13,8 @@ // limitations under the License. mod infer_schema_table; -mod source; +mod parquet; mod table_args; +mod separator; pub use infer_schema_table::InferSchemaTable; diff --git a/src/query/service/src/table_functions/infer_schema/parquet.rs b/src/query/service/src/table_functions/infer_schema/parquet.rs new file mode 100644 index 0000000000000..ebe13d80d434b --- /dev/null +++ b/src/query/service/src/table_functions/infer_schema/parquet.rs @@ -0,0 +1,99 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; + +use arrow_schema::Schema; +use databend_common_catalog::table_context::TableContext; +use databend_common_exception::Result; +use databend_common_expression::types::BooleanType; +use databend_common_expression::types::StringType; +use databend_common_expression::types::UInt64Type; +use databend_common_expression::DataBlock; +use databend_common_expression::FromData; +use databend_common_expression::TableSchema; +use databend_common_meta_app::principal::StageInfo; +use databend_common_pipeline_core::processors::OutputPort; +use databend_common_pipeline_core::processors::ProcessorPtr; +use databend_common_pipeline_sources::AsyncSource; +use databend_common_pipeline_sources::AsyncSourcer; +use databend_common_storage::{init_stage_operator, StageFileInfo}; +use databend_common_storage::read_parquet_schema_async_rs; +use futures_util::future::try_join_all; + +use crate::table_functions::infer_schema::infer_schema_table::INFER_SCHEMA; + +pub(crate) struct ParquetInferSchemaSource { + is_finished: bool, + + stage_info: StageInfo, + stage_file_infos: Vec<StageFileInfo>, +} + +impl ParquetInferSchemaSource { + pub fn create( + ctx: Arc<dyn TableContext>, + output: Arc<OutputPort>, + stage_info: StageInfo, + stage_file_infos: Vec<StageFileInfo>, + ) -> Result<ProcessorPtr> { + AsyncSourcer::create(ctx, output, ParquetInferSchemaSource { + is_finished: false, + stage_info, + stage_file_infos, + }) + } +} + +#[async_trait::async_trait] +impl AsyncSource for ParquetInferSchemaSource { + const NAME: &'static str = INFER_SCHEMA; + + #[async_backtrace::framed] + async fn generate(&mut self) -> Result<Option<DataBlock>> { + if self.is_finished { + return Ok(None); + } + self.is_finished = true; + + let operator = init_stage_operator(&self.stage_info)?; + let infer_schema_futures = self.stage_file_infos.iter().map(|file| async { + read_parquet_schema_async_rs(&operator, &file.path, Some(file.size)).await + }); + let arrow_schema = Schema::try_merge(try_join_all(infer_schema_futures).await?)?; + let table_schema = TableSchema::try_from(&arrow_schema)?; + + let mut names: Vec<String> = vec![]; + let mut types: Vec<String> = vec![]; + let mut nulls: Vec<bool> = vec![]; + + for field in table_schema.fields().iter() { + names.push(field.name().to_string()); + + let non_null_type = field.data_type().remove_recursive_nullable(); + types.push(non_null_type.sql_name()); + nulls.push(field.is_nullable()); + } + + let order_ids = (0..table_schema.fields().len() as u64).collect::<Vec<u64>>(); + + let block = DataBlock::new_from_columns(vec![ + StringType::from_data(names), + StringType::from_data(types), + BooleanType::from_data(nulls), + UInt64Type::from_data(order_ids), + ]); + Ok(Some(block)) + } +} diff --git a/src/query/service/src/table_functions/infer_schema/separator.rs b/src/query/service/src/table_functions/infer_schema/separator.rs new file mode 100644 index 0000000000000..24f838d4f9cb7 --- /dev/null +++ b/src/query/service/src/table_functions/infer_schema/separator.rs @@ -0,0 +1,140 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +use std::io::Cursor; +use arrow_csv::reader::Format; +use arrow_json::reader::{infer_json_schema_from_iterator, ValueIter}; +use arrow_schema::{ArrowError, Schema}; +use databend_common_expression::{BlockMetaInfoDowncast, DataBlock, FromData, TableSchema}; +use databend_common_pipeline_transforms::AccumulatingTransform; +use databend_common_exception::{ErrorCode, Result}; +use databend_common_expression::types::{BooleanType, StringType, UInt64Type}; +use databend_common_meta_app::principal::FileFormatParams; +use databend_common_storages_stage::BytesBatch; + +pub struct InferSchemaSeparator { + pub file_format_params: FileFormatParams, + pub bytes_buf: Vec<u8>, + pub max_records: Option<usize>, + is_finished: bool, +} + +impl InferSchemaSeparator { + pub fn create(file_format_params: FileFormatParams, max_records: Option<usize>) -> Self { + InferSchemaSeparator { + file_format_params, + bytes_buf: vec![], + max_records, + is_finished: false, + } + } +} + +impl AccumulatingTransform for InferSchemaSeparator { + const NAME: &'static str = "InferSchemaSeparator"; + + fn transform(&mut self, data: DataBlock) -> Result<Vec<DataBlock>> { + if self.is_finished { + return Ok(vec![DataBlock::empty()]); + } + let batch = data + .get_owned_meta() + .and_then(BytesBatch::downcast_from) + .unwrap(); + self.bytes_buf.extend(batch.data); + + // If max_records is set, try to infer from the bytes buffered so far; otherwise wait until all bytes are buffered + if self.max_records.is_none() && !batch.is_eof { + return Ok(vec![DataBlock::empty()]); + } + let bytes = Cursor::new(&self.bytes_buf); + let result = match &self.file_format_params { + FileFormatParams::Csv(params) => { + let escape = if params.escape.is_empty() { + None + } else { + Some(params.escape.as_bytes()[0]) + }; + + let mut format = Format::default() + .with_delimiter(params.field_delimiter.as_bytes()[0]) + .with_quote(params.quote.as_bytes()[0]) + .with_header(params.headers != 0); + if let Some(escape) = escape { + format = format.with_escape(escape); + } + format.infer_schema(bytes, self.max_records).map(|(schema, _)| schema).map_err(Some) + } + FileFormatParams::NdJson(_) => { + let mut records = ValueIter::new(bytes, self.max_records); + let fn_ndjson = |max_records| -> std::result::Result<Schema, Option<ArrowError>> { + if let Some(max_record) = max_records { + let mut tmp: Vec<std::result::Result<serde_json::Value, ArrowError>> = + Vec::with_capacity(max_record); + + for result in records { + tmp.push(Ok(result.map_err(|_| None)?)); + } + infer_json_schema_from_iterator(tmp.into_iter()).map_err(Some) + } else { + infer_json_schema_from_iterator(&mut records).map_err(Some) + } + }; + fn_ndjson(self.max_records) + } + _ => { + return Err(ErrorCode::BadArguments( + "InferSchemaSeparator is currently limited to format CSV and NDJSON", + )); + } + }; + let arrow_schema = match result { + Ok(schema) => schema, + Err(None) => { + return Ok(vec![DataBlock::empty()]) + } + Err(Some(err)) => { + if matches!(err, ArrowError::CsvError(_)) && self.max_records.is_some() && !batch.is_eof { + return Ok(vec![DataBlock::empty()]); + } + return Err(err.into()); + } + }; + self.is_finished = true; + + let table_schema = TableSchema::try_from(&arrow_schema)?; + + let mut names: Vec<String> = vec![]; + let mut types: Vec<String> = vec![]; + let mut nulls: Vec<bool> = vec![]; + + for field in table_schema.fields().iter() { + names.push(field.name().to_string()); + + let non_null_type = field.data_type().remove_recursive_nullable(); + types.push(non_null_type.sql_name()); + nulls.push(field.is_nullable()); + } + + let order_ids = (0..table_schema.fields().len() as u64).collect::<Vec<u64>>(); + + let
block = DataBlock::new_from_columns(vec![ + StringType::from_data(names), + StringType::from_data(types), + BooleanType::from_data(nulls), + UInt64Type::from_data(order_ids), + ]); + Ok(vec![block]) + } +} \ No newline at end of file diff --git a/src/query/service/src/table_functions/infer_schema/source.rs b/src/query/service/src/table_functions/infer_schema/source.rs deleted file mode 100644 index c58b654fec3c9..0000000000000 --- a/src/query/service/src/table_functions/infer_schema/source.rs +++ /dev/null @@ -1,298 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::borrow::Cow; -use std::cmp; -use std::collections::BTreeMap; -use std::io::Cursor; -use std::sync::Arc; - -use arrow_csv::reader::Format; -use arrow_json::reader::infer_json_schema_from_iterator; -use arrow_json::reader::ValueIter; -use arrow_schema::ArrowError; -use arrow_schema::Schema; -use arrow_schema::Schema as ArrowSchema; -use bytes::BufMut; -use databend_common_ast::ast::FileLocation; -use databend_common_ast::ast::UriLocation; -use databend_common_catalog::table_context::TableContext; -use databend_common_compress::CompressAlgorithm; -use databend_common_compress::DecompressDecoder; -use databend_common_exception::ErrorCode; -use databend_common_exception::Result; -use databend_common_expression::types::BooleanType; -use databend_common_expression::types::StringType; -use databend_common_expression::types::UInt64Type; -use databend_common_expression::DataBlock; -use databend_common_expression::FromData; -use databend_common_expression::TableSchema; -use databend_common_meta_app::principal::FileFormatParams; -use databend_common_meta_app::principal::StageType; -use databend_common_pipeline_core::processors::OutputPort; -use databend_common_pipeline_core::processors::ProcessorPtr; -use databend_common_pipeline_sources::AsyncSource; -use databend_common_pipeline_sources::AsyncSourcer; -use databend_common_sql::binder::resolve_file_location; -use databend_common_storage::init_stage_operator; -use databend_common_storage::read_parquet_schema_async_rs; -use databend_common_storage::StageFilesInfo; -use databend_common_users::Object; -use futures_util::future::try_join_all; -use opendal::Operator; -use opendal::Scheme; - -use crate::table_functions::infer_schema::infer_schema_table::INFER_SCHEMA; -use crate::table_functions::infer_schema::table_args::InferSchemaArgsParsed; - -const DEFAULT_BYTES: u64 = 10; - -pub(crate) struct InferSchemaSource { - is_finished: bool, - ctx: Arc, - args_parsed: InferSchemaArgsParsed, -} - -impl InferSchemaSource { - pub fn create( - ctx: Arc, - output: Arc, - args_parsed: InferSchemaArgsParsed, - ) -> Result { - AsyncSourcer::create(ctx.clone(), output, InferSchemaSource { - is_finished: false, - ctx, - args_parsed, - }) - } -} - -#[async_trait::async_trait] -impl AsyncSource for InferSchemaSource { - const NAME: &'static str = INFER_SCHEMA; - - #[async_backtrace::framed] - async fn generate(&mut self) -> Result> { - if 
self.is_finished { - return Ok(None); - } - self.is_finished = true; - - let file_location = if let Some(location) = - self.args_parsed.location.clone().strip_prefix('@') - { - FileLocation::Stage(location.to_string()) - } else if let Some(connection_name) = &self.args_parsed.connection_name { - let conn = self.ctx.get_connection(connection_name).await?; - let uri = - UriLocation::from_uri(self.args_parsed.location.clone(), conn.storage_params)?; - let proto = conn.storage_type.parse::()?; - if proto != uri.protocol.parse::()? { - return Err(ErrorCode::BadArguments(format!( - "protocol from connection_name={connection_name} ({proto}) not match with uri protocol ({0}).", - uri.protocol - ))); - } - FileLocation::Uri(uri) - } else { - let uri = - UriLocation::from_uri(self.args_parsed.location.clone(), BTreeMap::default())?; - FileLocation::Uri(uri) - }; - let (stage_info, path) = resolve_file_location(self.ctx.as_ref(), &file_location).await?; - let enable_experimental_rbac_check = self - .ctx - .get_settings() - .get_enable_experimental_rbac_check()?; - if enable_experimental_rbac_check { - let visibility_checker = self - .ctx - .get_visibility_checker(false, Object::Stage) - .await?; - if !(stage_info.is_temporary - || visibility_checker.check_stage_read_visibility(&stage_info.stage_name) - || stage_info.stage_type == StageType::User - && stage_info.stage_name == self.ctx.get_current_user()?.name) - { - return Err(ErrorCode::PermissionDenied(format!( - "Permission denied: privilege READ is required on stage {} for user {}", - stage_info.stage_name.clone(), - &self.ctx.get_current_user()?.identity().display(), - ))); - } - } - let files_info = StageFilesInfo { - path: path.clone(), - ..self.args_parsed.files_info.clone() - }; - let operator = init_stage_operator(&stage_info)?; - - let stage_file_infos = files_info.list(&operator, 1, None).await?; - let infer_schema_futures = stage_file_infos.iter().map(|file| async { - let file_format_params = match &self.args_parsed.file_format { - Some(f) => self.ctx.get_file_format(f).await?, - None => stage_info.file_format_params.clone(), - }; - let schema = match file_format_params { - FileFormatParams::Csv(params) => { - let escape = if params.escape.is_empty() { - None - } else { - Some(params.escape.as_bytes()[0]) - }; - - let mut format = Format::default() - .with_delimiter(params.field_delimiter.as_bytes()[0]) - .with_quote(params.quote.as_bytes()[0]) - .with_header(params.headers != 0); - if let Some(escape) = escape { - format = format.with_escape(escape); - } - - read_metadata_async( - &file.path, - &operator, - Some(file.size), - self.args_parsed.max_records, - |reader, max_record| format.infer_schema(reader, max_record).map_err(Some), - ) - .await? - } - FileFormatParams::NdJson(_) => { - read_metadata_async( - &file.path, - &operator, - Some(file.size), - self.args_parsed.max_records, - |reader, max_record| { - let mut records = ValueIter::new(reader, max_record); - - let schema = if let Some(max_record) = max_record { - let mut tmp: Vec> = - Vec::with_capacity(max_record); - - for result in records { - tmp.push(Ok(result.map_err(|_| None)?)); - } - infer_json_schema_from_iterator(tmp.into_iter()).map_err(Some)? - } else { - infer_json_schema_from_iterator(&mut records).map_err(Some)? - }; - - Ok((schema, 0)) - }, - ) - .await? - } - FileFormatParams::Parquet(_) => { - read_parquet_schema_async_rs(&operator, &file.path, Some(file.size)).await? 
- } - _ => { - return Err(ErrorCode::BadArguments( - "infer_schema is currently limited to format Parquet, CSV and NDJSON", - )); - } - }; - Ok(schema) - }); - let arrow_schema = Schema::try_merge(try_join_all(infer_schema_futures).await?)?; - let table_schema = TableSchema::try_from(&arrow_schema)?; - - let mut names: Vec = vec![]; - let mut types: Vec = vec![]; - let mut nulls: Vec = vec![]; - - for field in table_schema.fields().iter() { - names.push(field.name().to_string()); - - let non_null_type = field.data_type().remove_recursive_nullable(); - types.push(non_null_type.sql_name()); - nulls.push(field.is_nullable()); - } - - let order_ids = (0..table_schema.fields().len() as u64).collect::>(); - - let block = DataBlock::new_from_columns(vec![ - StringType::from_data(names), - StringType::from_data(types), - BooleanType::from_data(nulls), - UInt64Type::from_data(order_ids), - ]); - Ok(Some(block)) - } -} - -pub async fn read_metadata_async< - F: Fn( - Cursor<&[u8]>, - Option, - ) -> std::result::Result<(ArrowSchema, usize), Option>, ->( - path: &str, - operator: &Operator, - file_size: Option, - max_records: Option, - func_infer_schema: F, -) -> Result { - let file_size = match file_size { - None => operator.stat(path).await?.content_length(), - Some(n) => n, - }; - let algo = CompressAlgorithm::from_path(path); - - let mut buf = Vec::new(); - let mut offset: u64 = 0; - let mut chunk_size: u64 = - if max_records.is_none() || matches!(algo, Some(CompressAlgorithm::Zip)) { - file_size - } else { - DEFAULT_BYTES - }; - - loop { - let end = cmp::min(offset + chunk_size, file_size); - - let chunk = operator.read_with(path).range(offset..end).await?; - buf.put(chunk); - - offset = end; - - let bytes = if let Some(algo) = algo { - let decompress_bytes = if CompressAlgorithm::Zip == algo { - DecompressDecoder::decompress_all_zip(&buf)? - } else { - DecompressDecoder::new(algo).decompress_batch(&buf)? - }; - Cow::Owned(decompress_bytes) - } else { - Cow::Borrowed(&buf) - }; - - if !bytes.is_empty() || offset >= file_size { - match func_infer_schema(Cursor::new(bytes.as_slice()), max_records) { - Ok((schema, _)) => { - return Ok(schema); - } - Err(Some(err)) => { - if matches!(err, ArrowError::CsvError(_)) && offset < file_size { - continue; - } - return Err(ErrorCode::from(err)); - } - Err(None) => (), - } - } - chunk_size = cmp::min(chunk_size * 2, file_size - offset); - } -} diff --git a/src/query/storages/stage/src/infer_schema.rs b/src/query/storages/stage/src/infer_schema.rs new file mode 100644 index 0000000000000..d986e19ab9736 --- /dev/null +++ b/src/query/storages/stage/src/infer_schema.rs @@ -0,0 +1,79 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::any::Any; +use std::hash::{DefaultHasher, Hash, Hasher}; +use std::sync::Arc; +use databend_common_catalog::plan::{PartInfo, PartInfoPtr, PartInfoType}; +use databend_common_exception::ErrorCode; +use databend_common_meta_app::principal::{FileFormatParams, StageInfo}; +use databend_common_storage::{StageFileInfo, StageFilesInfo}; + +#[derive(serde::Serialize, serde::Deserialize, PartialEq, Eq)] +pub struct InferSchemaPartInfo { + pub files_info: StageFilesInfo, + pub file_format_params: FileFormatParams, + pub stage_info: StageInfo, + pub stage_file_infos: Vec<StageFileInfo>, + +} + +#[typetag::serde(name = "infer_schema")] +impl PartInfo for InferSchemaPartInfo { + fn as_any(&self) -> &dyn Any { + self + } + + fn equals(&self, info: &Box<dyn PartInfo>) -> bool { + info.as_any() + .downcast_ref::<InferSchemaPartInfo>() + .is_some_and(|other| self == other) + } + + fn hash(&self) -> u64 { + let mut s = DefaultHasher::new(); + self.files_info.hash(&mut s); + self.file_format_params.hash(&mut s); + self.stage_info.hash(&mut s); + s.finish() + } + + fn part_type(&self) -> PartInfoType { + PartInfoType::LazyLevel + } +} + +impl InferSchemaPartInfo { + pub fn create( + files_info: StageFilesInfo, + file_format_params: FileFormatParams, + stage_info: StageInfo, + stage_file_infos: Vec<StageFileInfo>, + ) -> PartInfoPtr { + Arc::new(Box::new(InferSchemaPartInfo { + files_info, + file_format_params, + stage_info, + stage_file_infos, + })) + } + + pub fn from_part(info: &PartInfoPtr) -> databend_common_exception::Result<&InferSchemaPartInfo> { + info.as_any() + .downcast_ref::<InferSchemaPartInfo>() + .ok_or_else(|| { + ErrorCode::Internal("Cannot downcast from PartInfo to InferSchemaPartInfo.") + }) + } +} \ No newline at end of file diff --git a/src/query/storages/stage/src/lib.rs b/src/query/storages/stage/src/lib.rs index 96573e42f4af2..e52a1ab93215b 100644 --- a/src/query/storages/stage/src/lib.rs +++ b/src/query/storages/stage/src/lib.rs @@ -26,6 +26,7 @@ mod stage_table; mod streaming_load; mod transform_generating; mod transform_null_if; +mod infer_schema; pub use append::StageSinkTable; pub use compression::get_compression_with_path; @@ -33,3 +34,7 @@ pub use read::row_based::BytesBatch; pub use stage_table::StageTable; pub use streaming_load::build_streaming_load_pipeline; pub use transform_null_if::TransformNullIf; +pub use infer_schema::InferSchemaPartInfo; +pub use read::row_based::BytesReader; +pub use read::row_based::Decompressor; +pub use read::LoadContext; diff --git a/src/query/storages/stage/src/read/mod.rs b/src/query/storages/stage/src/read/mod.rs index 98f677c51fbde..984e6c62f78a5 100644 --- a/src/query/storages/stage/src/read/mod.rs +++ b/src/query/storages/stage/src/read/mod.rs @@ -20,3 +20,4 @@ pub mod row_based; pub(crate) mod block_builder_state; mod default_expr_evaluator; pub(crate) mod whole_file_reader; +pub use load_context::LoadContext; diff --git a/src/query/storages/stage/src/read/row_based/mod.rs b/src/query/storages/stage/src/read/row_based/mod.rs index a630ae43fabfc..409236777b951 100644 --- a/src/query/storages/stage/src/read/row_based/mod.rs +++ b/src/query/storages/stage/src/read/row_based/mod.rs @@ -21,3 +21,5 @@ mod utils; pub use batch::BytesBatch; pub use read_pipeline::RowBasedReadPipelineBuilder; +pub use processors::BytesReader; +pub use processors::Decompressor; diff --git a/tests/data/csv/max_records.csv.xz b/tests/data/csv/max_records.xz similarity index 100% rename from tests/data/csv/max_records.csv.xz rename to tests/data/csv/max_records.xz diff --git a/tests/data/csv/max_records.csv.zst
b/tests/data/csv/max_records.zst similarity index 100% rename from tests/data/csv/max_records.csv.zst rename to tests/data/csv/max_records.zst diff --git a/tests/data/ndjson/max_records.ndjson.xz b/tests/data/ndjson/max_records.xz similarity index 100% rename from tests/data/ndjson/max_records.ndjson.xz rename to tests/data/ndjson/max_records.xz diff --git a/tests/data/ndjson/max_records.ndjson.zst b/tests/data/ndjson/max_records.zst similarity index 100% rename from tests/data/ndjson/max_records.ndjson.zst rename to tests/data/ndjson/max_records.zst diff --git a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test index f5c0cc04546cf..2629bd5a0f351 100644 --- a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test +++ b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test @@ -105,7 +105,7 @@ id BIGINT 1 0 value BIGINT 1 1 query TTBI -select * from infer_schema(location => '@data/csv/max_records.csv.zst', file_format => 'head_csv_format', max_records_pre_file => 5); +select * from infer_schema(location => '@data/csv/max_records.zst', file_format => 'head_csv_format', max_records_pre_file => 5); ---- id BIGINT 1 0 value BIGINT 1 1 @@ -117,7 +117,7 @@ id BIGINT 1 0 value BIGINT 1 1 query TTBI -select * from infer_schema(location => '@data/csv/max_records.csv.xz', file_format => 'head_csv_format', max_records_pre_file => 5); +select * from infer_schema(location => '@data/csv/max_records.xz', file_format => 'head_csv_format', max_records_pre_file => 5); ---- id BIGINT 1 0 value BIGINT 1 1 @@ -138,6 +138,11 @@ utf8_col VARCHAR 1 8 query TTBI select * from infer_schema(location => '@data/csv/merge/', file_format => 'head_csv_format'); ---- +col1 VARCHAR 1 0 +col2 VARCHAR 1 1 +col3 VARCHAR 1 2 +col4 VARCHAR 1 3 +col5 VARCHAR 1 4 # NDJSON query TTBI @@ -172,7 +177,7 @@ id BIGINT 1 0 value BIGINT 1 1 query TTBI -select * from infer_schema(location => '@data/ndjson/max_records.ndjson.zst', file_format => 'NDJSON', max_records_pre_file => 5); +select * from infer_schema(location => '@data/ndjson/max_records.zst', file_format => 'NDJSON', max_records_pre_file => 5); ---- id BIGINT 1 0 value BIGINT 1 1 @@ -184,7 +189,7 @@ id BIGINT 1 0 value BIGINT 1 1 query TTBI -select * from infer_schema(location => '@data/ndjson/max_records.ndjson.xz', file_format => 'NDJSON', max_records_pre_file => 5) +select * from infer_schema(location => '@data/ndjson/max_records.xz', file_format => 'NDJSON', max_records_pre_file => 5) ---- id BIGINT 1 0 value BIGINT 1 1 @@ -207,3 +212,8 @@ obj_col TUPLE(A INT64, B STRING) 1 10 query TTBI select * from infer_schema(location => '@data/ndjson/merge/', file_format => 'NDJSON'); ---- +col1 VARCHAR 1 0 +col2 VARCHAR 1 1 +col3 VARCHAR 1 2 +col4 VARCHAR 1 3 +col5 VARCHAR 1 4 From 178aacffd322859108e430de82dd0fe5c3bc16e1 Mon Sep 17 00:00:00 2001 From: kould Date: Thu, 4 Sep 2025 17:53:52 +0800 Subject: [PATCH 15/20] feat: InferSeparator multi-file processing and Schema promote merging type --- src/query/ast/src/ast/statements/copy.rs | 4 +- .../infer_schema/infer_schema_table.rs | 42 ++- .../src/table_functions/infer_schema/merge.rs | 284 ++++++++++++++++++ .../src/table_functions/infer_schema/mod.rs | 3 +- .../table_functions/infer_schema/parquet.rs | 4 +- .../table_functions/infer_schema/separator.rs | 80 +++-- src/query/storages/stage/src/infer_schema.rs | 22 +- src/query/storages/stage/src/lib.rs | 10 +- .../storages/stage/src/read/row_based/mod.rs | 2 +- 
.../stage/formats/parquet/infer_schema.test | 10 +- 10 files changed, 410 insertions(+), 51 deletions(-) create mode 100644 src/query/service/src/table_functions/infer_schema/merge.rs diff --git a/src/query/ast/src/ast/statements/copy.rs b/src/query/ast/src/ast/statements/copy.rs index 7712581b32880..b207b05d7d879 100644 --- a/src/query/ast/src/ast/statements/copy.rs +++ b/src/query/ast/src/ast/statements/copy.rs @@ -648,7 +648,9 @@ impl Display for FileFormatValue { } } -#[derive(serde::Serialize, serde::Deserialize, Debug, Clone, PartialEq, Drive, DriveMut, Eq, Hash)] +#[derive( + serde::Serialize, serde::Deserialize, Debug, Clone, PartialEq, Drive, DriveMut, Eq, Hash, +)] pub enum OnErrorMode { Continue, SkipFileNum(u64), diff --git a/src/query/service/src/table_functions/infer_schema/infer_schema_table.rs b/src/query/service/src/table_functions/infer_schema/infer_schema_table.rs index 1b90dd139d1d1..fd4ae9c947fcd 100644 --- a/src/query/service/src/table_functions/infer_schema/infer_schema_table.rs +++ b/src/query/service/src/table_functions/infer_schema/infer_schema_table.rs @@ -18,17 +18,21 @@ use std::sync::Arc; use databend_common_ast::ast::FileLocation; use databend_common_ast::ast::UriLocation; -use databend_common_catalog::plan::{DataSourcePlan, PartInfo, StageTableInfo}; +use databend_common_catalog::plan::DataSourcePlan; +use databend_common_catalog::plan::PartInfo; use databend_common_catalog::plan::PartStatistics; use databend_common_catalog::plan::Partitions; use databend_common_catalog::plan::PartitionsShuffleKind; use databend_common_catalog::plan::PushDownInfo; +use databend_common_catalog::plan::StageTableInfo; use databend_common_catalog::table::Table; use databend_common_catalog::table_args::TableArgs; +use databend_common_compress::CompressAlgorithm; use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::types::NumberDataType; -use databend_common_expression::{BlockThresholds, TableDataType}; +use databend_common_expression::BlockThresholds; +use databend_common_expression::TableDataType; use databend_common_expression::TableField; use databend_common_expression::TableSchema; use databend_common_expression::TableSchemaRefExt; @@ -40,16 +44,18 @@ use databend_common_meta_app::schema::TableInfo; use databend_common_meta_app::schema::TableMeta; use databend_common_pipeline_core::Pipeline; use databend_common_pipeline_sources::PrefetchAsyncSourcer; +use databend_common_pipeline_transforms::TransformPipelineHelper; use databend_common_sql::binder::resolve_file_location; use databend_common_storage::init_stage_operator; use databend_common_storage::StageFilesInfo; -use databend_common_storages_stage::{BytesReader, Decompressor, LoadContext}; +use databend_common_storages_stage::BytesReader; +use databend_common_storages_stage::Decompressor; use databend_common_storages_stage::InferSchemaPartInfo; +use databend_common_storages_stage::LoadContext; use databend_common_users::Object; -use opendal::Scheme; -use databend_common_compress::CompressAlgorithm; -use databend_common_pipeline_transforms::TransformPipelineHelper; use databend_storages_common_stage::SingleFilePartition; +use opendal::Scheme; + use super::parquet::ParquetInferSchemaSource; use crate::sessions::TableContext; use crate::table_functions::infer_schema::separator::InferSchemaSeparator; @@ -187,7 +193,12 @@ impl Table for InferSchemaTable { Ok(( PartStatistics::default(), Partitions::create(PartitionsShuffleKind::Seq, vec![ - 
InferSchemaPartInfo::create(files_info, file_format_params, stage_info, stage_file_infos), + InferSchemaPartInfo::create( + files_info, + file_format_params, + stage_info, + stage_file_infos, + ), ]), )) } @@ -210,7 +221,8 @@ impl Table for InferSchemaTable { match info.file_format_params { FileFormatParams::Csv(_) | FileFormatParams::NdJson(_) => { - let partitions = info.stage_file_infos + let partitions = info + .stage_file_infos .iter() .map(|v| { let part = SingleFilePartition { @@ -248,11 +260,15 @@ impl Table for InferSchemaTable { let mut algo = None; for file_info in info.stage_file_infos.iter() { - let Some(new_algo) = CompressAlgorithm::from_path(&file_info.path) else { continue }; + let Some(new_algo) = CompressAlgorithm::from_path(&file_info.path) else { + continue; + }; if let Some(algo) = algo { if algo != new_algo { - return Err(ErrorCode::UnknownCompressionType("`infer_schema` only supports single compression type")); + return Err(ErrorCode::UnknownCompressionType( + "`infer_schema` only supports single compression type", + )); } } algo = Some(new_algo); @@ -263,7 +279,11 @@ impl Table for InferSchemaTable { })?; } pipeline.add_accumulating_transformer(|| { - InferSchemaSeparator::create(info.file_format_params.clone(), self.args_parsed.max_records) + InferSchemaSeparator::create( + info.file_format_params.clone(), + self.args_parsed.max_records, + info.stage_file_infos.len(), + ) }); } FileFormatParams::Parquet(_) => { diff --git a/src/query/service/src/table_functions/infer_schema/merge.rs b/src/query/service/src/table_functions/infer_schema/merge.rs new file mode 100644 index 0000000000000..1b441f6a68b24 --- /dev/null +++ b/src/query/service/src/table_functions/infer_schema/merge.rs @@ -0,0 +1,284 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use databend_common_expression::types::NumberDataType; +use databend_common_expression::TableDataType; +use databend_common_expression::TableSchema; + +const UNSIGNED_TYPES: [NumberDataType; 4] = [ + NumberDataType::UInt8, + NumberDataType::UInt16, + NumberDataType::UInt32, + NumberDataType::UInt64, +]; +const SIGNED_TYPES: [NumberDataType; 4] = [ + NumberDataType::Int8, + NumberDataType::Int16, + NumberDataType::Int32, + NumberDataType::Int64, +]; +const FLOAT_TYPES: [NumberDataType; 2] = [NumberDataType::Float32, NumberDataType::Float64]; + +fn wrap_nullable(ty: TableDataType, is_nullable: bool) -> TableDataType { + if is_nullable { + ty.wrap_nullable() + } else { + ty + } +} + +pub fn merge_type( + old: TableDataType, + new: TableDataType, + is_nullable: bool, +) -> Option { + if old.remove_nullable() == new.remove_nullable() { + return Some(wrap_nullable(old, is_nullable)); + } + if let (TableDataType::Number(old_num), TableDataType::Number(new_num)) = + (new.remove_nullable(), old.remove_nullable()) + { + if old_num.is_float() && new_num.is_float() { + return promote_numeric(&old, &new, &FLOAT_TYPES) + .map(|ty| wrap_nullable(ty, is_nullable)); + } + return promote_numeric(&old, &new, &SIGNED_TYPES) + .or_else(|| promote_numeric(&old, &new, &UNSIGNED_TYPES)) + .map(|ty| wrap_nullable(ty, is_nullable)); + } + None +} + +pub fn promote_numeric( + a: &TableDataType, + b: &TableDataType, + types: &[NumberDataType], +) -> Option { + let idx_a = match a { + TableDataType::Number(n) => types.iter().position(|t| t == n), + _ => None, + }; + let idx_b = match b { + TableDataType::Number(n) => types.iter().position(|t| t == n), + _ => None, + }; + match (idx_a, idx_b) { + (Some(i), Some(j)) => Some(TableDataType::Number(types[usize::max(i, j)].clone())), + _ => None, + } +} + +pub fn merge_schema(defined: TableSchema, guess: TableSchema) -> TableSchema { + let TableSchema { + fields: mut def_fields, + .. + } = defined; + let TableSchema { + fields: guess_fields, + .. 
+ } = guess; + + for guess_field in guess_fields { + match def_fields + .iter_mut() + .find(|def_field| def_field.name() == guess_field.name()) + { + None => { + def_fields.push(guess_field); + } + Some(def_field) => { + let is_nullable = + def_field.data_type.is_nullable() || guess_field.data_type.is_nullable(); + def_field.data_type = merge_type( + def_field.data_type.clone(), + guess_field.data_type, + is_nullable, + ) + .unwrap_or_else(|| wrap_nullable(TableDataType::String, is_nullable)); + } + } + } + + TableSchema::new(def_fields) +} + +#[cfg(test)] +mod tests { + use databend_common_expression::types::NumberDataType; + use databend_common_expression::TableDataType; + use databend_common_expression::TableField; + use databend_common_expression::TableSchema; + + use crate::table_functions::infer_schema::merge::merge_schema; + use crate::table_functions::infer_schema::merge::merge_type; + + #[test] + fn test_promote_unsigned() { + assert_eq!( + merge_type( + TableDataType::Number(NumberDataType::UInt8), + TableDataType::Number(NumberDataType::UInt16), + false, + ), + Some(TableDataType::Number(NumberDataType::UInt16)) + ); + assert_eq!( + merge_type( + TableDataType::Number(NumberDataType::UInt32), + TableDataType::Number(NumberDataType::UInt64), + false, + ), + Some(TableDataType::Number(NumberDataType::UInt64)) + ); + assert_eq!( + merge_type( + TableDataType::Number(NumberDataType::UInt8), + TableDataType::Number(NumberDataType::Int8), + false, + ), + None + ); + } + + #[test] + fn test_promote_signed() { + assert_eq!( + merge_type( + TableDataType::Number(NumberDataType::Int8), + TableDataType::Number(NumberDataType::Int16), + false, + ), + Some(TableDataType::Number(NumberDataType::Int16)) + ); + assert_eq!( + merge_type( + TableDataType::Number(NumberDataType::Int32), + TableDataType::Number(NumberDataType::Int64), + false, + ), + Some(TableDataType::Number(NumberDataType::Int64)) + ); + assert_eq!( + merge_type( + TableDataType::Number(NumberDataType::Int8), + TableDataType::Number(NumberDataType::UInt8), + false, + ), + None + ); + } + + #[test] + fn test_promote_integer() { + assert_eq!( + merge_type( + TableDataType::Number(NumberDataType::Int8), + TableDataType::Number(NumberDataType::Int16), + false, + ), + Some(TableDataType::Number(NumberDataType::Int16)) + ); + assert_eq!( + merge_type( + TableDataType::Number(NumberDataType::UInt8), + TableDataType::Number(NumberDataType::UInt32), + false, + ), + Some(TableDataType::Number(NumberDataType::UInt32)) + ); + assert_eq!( + merge_type( + TableDataType::Number(NumberDataType::Int8), + TableDataType::Number(NumberDataType::UInt8), + false, + ), + None + ); + } + + #[test] + fn test_promote_float() { + assert_eq!( + merge_type( + TableDataType::Number(NumberDataType::Float32), + TableDataType::Number(NumberDataType::Float64), + false, + ), + Some(TableDataType::Number(NumberDataType::Float64)) + ); + } + + #[test] + fn test_promote_numeric() { + assert_eq!( + merge_type( + TableDataType::Number(NumberDataType::Int8), + TableDataType::Number(NumberDataType::Int16), + false, + ), + Some(TableDataType::Number(NumberDataType::Int16)) + ); + assert_eq!( + merge_type( + TableDataType::Number(NumberDataType::Float32), + TableDataType::Number(NumberDataType::Int16), + false, + ), + None + ); + assert_eq!( + merge_type( + TableDataType::String, + TableDataType::Number(NumberDataType::Int32), + false, + ), + None + ); + } + + #[test] + fn test_merge_schema() { + let schema_1 = TableSchema::new(vec![ + TableField::new( + "c1", + 
TableDataType::Nullable(Box::new(TableDataType::Number(NumberDataType::Int8))), + ), + TableField::new("c2", TableDataType::Number(NumberDataType::Int8)), + TableField::new("c3", TableDataType::Number(NumberDataType::Int32)), + TableField::new("c4", TableDataType::Number(NumberDataType::Float32)), + TableField::new("c5", TableDataType::Number(NumberDataType::Float32)), + ]); + let schema_2 = TableSchema::new(vec![ + TableField::new("c1", TableDataType::Number(NumberDataType::Int8)), + TableField::new("c3", TableDataType::Number(NumberDataType::Float32)), + TableField::new("c2", TableDataType::Number(NumberDataType::Int8)), + TableField::new("c4", TableDataType::Number(NumberDataType::Float32)), + TableField::new("c6", TableDataType::Number(NumberDataType::Float32)), + ]); + + let schema = merge_schema(schema_1, schema_2); + let expected_schema = TableSchema::new(vec![ + TableField::new( + "c1", + TableDataType::Nullable(Box::new(TableDataType::Number(NumberDataType::Int8))), + ), + TableField::new("c2", TableDataType::Number(NumberDataType::Int8)), + TableField::new("c3", TableDataType::String), + TableField::new("c4", TableDataType::Number(NumberDataType::Float32)), + TableField::new("c5", TableDataType::Number(NumberDataType::Float32)), + TableField::new("c6", TableDataType::Number(NumberDataType::Float32)), + ]); + assert_eq!(schema, expected_schema); + } +} diff --git a/src/query/service/src/table_functions/infer_schema/mod.rs b/src/query/service/src/table_functions/infer_schema/mod.rs index 3009bdfa92daa..82597e959806b 100644 --- a/src/query/service/src/table_functions/infer_schema/mod.rs +++ b/src/query/service/src/table_functions/infer_schema/mod.rs @@ -13,8 +13,9 @@ // limitations under the License. mod infer_schema_table; +mod merge; mod parquet; -mod table_args; mod separator; +mod table_args; pub use infer_schema_table::InferSchemaTable; diff --git a/src/query/service/src/table_functions/infer_schema/parquet.rs b/src/query/service/src/table_functions/infer_schema/parquet.rs index ebe13d80d434b..38e15819d0eb7 100644 --- a/src/query/service/src/table_functions/infer_schema/parquet.rs +++ b/src/query/service/src/table_functions/infer_schema/parquet.rs @@ -28,8 +28,9 @@ use databend_common_pipeline_core::processors::OutputPort; use databend_common_pipeline_core::processors::ProcessorPtr; use databend_common_pipeline_sources::AsyncSource; use databend_common_pipeline_sources::AsyncSourcer; -use databend_common_storage::{init_stage_operator, StageFileInfo}; +use databend_common_storage::init_stage_operator; use databend_common_storage::read_parquet_schema_async_rs; +use databend_common_storage::StageFileInfo; use futures_util::future::try_join_all; use crate::table_functions::infer_schema::infer_schema_table::INFER_SCHEMA; @@ -71,6 +72,7 @@ impl AsyncSource for ParquetInferSchemaSource { let infer_schema_futures = self.stage_file_infos.iter().map(|file| async { read_parquet_schema_async_rs(&operator, &file.path, Some(file.size)).await }); + // todo: unify_schemas(arrow-rs unsupported now) let arrow_schema = Schema::try_merge(try_join_all(infer_schema_futures).await?)?; let table_schema = TableSchema::try_from(&arrow_schema)?; diff --git a/src/query/service/src/table_functions/infer_schema/separator.rs b/src/query/service/src/table_functions/infer_schema/separator.rs index 24f838d4f9cb7..68335bf4a930c 100644 --- a/src/query/service/src/table_functions/infer_schema/separator.rs +++ b/src/query/service/src/table_functions/infer_schema/separator.rs @@ -12,30 +12,50 @@ // See the License 
for the specific language governing permissions and // limitations under the License. +use std::collections::HashMap; use std::io::Cursor; + use arrow_csv::reader::Format; -use arrow_json::reader::{infer_json_schema_from_iterator, ValueIter}; -use arrow_schema::{ArrowError, Schema}; -use databend_common_expression::{BlockMetaInfoDowncast, DataBlock, FromData, TableSchema}; -use databend_common_pipeline_transforms::AccumulatingTransform; -use databend_common_exception::{ErrorCode, Result}; -use databend_common_expression::types::{BooleanType, StringType, UInt64Type}; +use arrow_json::reader::infer_json_schema_from_iterator; +use arrow_json::reader::ValueIter; +use arrow_schema::ArrowError; +use arrow_schema::Schema; +use databend_common_exception::ErrorCode; +use databend_common_exception::Result; +use databend_common_expression::types::BooleanType; +use databend_common_expression::types::StringType; +use databend_common_expression::types::UInt64Type; +use databend_common_expression::BlockMetaInfoDowncast; +use databend_common_expression::DataBlock; +use databend_common_expression::FromData; +use databend_common_expression::TableSchema; use databend_common_meta_app::principal::FileFormatParams; +use databend_common_pipeline_transforms::AccumulatingTransform; use databend_common_storages_stage::BytesBatch; +use crate::table_functions::infer_schema::merge::merge_schema; + pub struct InferSchemaSeparator { pub file_format_params: FileFormatParams, - pub bytes_buf: Vec, + files: HashMap>, pub max_records: Option, + schemas: Vec, + files_len: usize, is_finished: bool, } impl InferSchemaSeparator { - pub fn create(file_format_params: FileFormatParams, max_records: Option) -> Self { + pub fn create( + file_format_params: FileFormatParams, + max_records: Option, + files_len: usize, + ) -> Self { InferSchemaSeparator { file_format_params, - bytes_buf: vec![], + files: HashMap::new(), max_records, + schemas: Vec::with_capacity(files_len), + files_len, is_finished: false, } } @@ -52,13 +72,15 @@ impl AccumulatingTransform for InferSchemaSeparator { .get_owned_meta() .and_then(BytesBatch::downcast_from) .unwrap(); - self.bytes_buf.extend(batch.data); + + let bytes = self.files.entry(batch.path.clone()).or_insert(Vec::new()); + bytes.extend(batch.data); // When max_records exists, it will try to use the current bytes to read, otherwise it will buffer all bytes if self.max_records.is_none() && !batch.is_eof { return Ok(vec![DataBlock::empty()]); } - let bytes = Cursor::new(&self.bytes_buf); + let bytes = Cursor::new(bytes); let result = match &self.file_format_params { FileFormatParams::Csv(params) => { let escape = if params.escape.is_empty() { @@ -74,7 +96,10 @@ impl AccumulatingTransform for InferSchemaSeparator { if let Some(escape) = escape { format = format.with_escape(escape); } - format.infer_schema(bytes, self.max_records).map(|(schema, _)| schema).map_err(Some) + format + .infer_schema(bytes, self.max_records) + .map(|(schema, _)| schema) + .map_err(Some) } FileFormatParams::NdJson(_) => { let mut records = ValueIter::new(bytes, self.max_records); @@ -101,19 +126,36 @@ impl AccumulatingTransform for InferSchemaSeparator { }; let arrow_schema = match result { Ok(schema) => schema, - Err(None) => { - return Ok(vec![DataBlock::empty()]) - } + Err(None) => return Ok(vec![DataBlock::empty()]), Err(Some(err)) => { - if matches!(err, ArrowError::CsvError(_)) && self.max_records.is_some() && !batch.is_eof { + if matches!(err, ArrowError::CsvError(_)) + && self.max_records.is_some() + && !batch.is_eof + 
{ return Ok(vec![DataBlock::empty()]); } return Err(err.into()); } }; - self.is_finished = true; + self.files.remove(&batch.path); + self.schemas.push(arrow_schema); - let table_schema = TableSchema::try_from(&arrow_schema)?; + if self.schemas.len() != self.files_len { + return Ok(vec![DataBlock::empty()]); + } + self.is_finished = true; + if self.schemas.len() == 0 { + return Ok(vec![DataBlock::empty()]); + } + let table_schema = if self.schemas.len() == 1 { + TableSchema::try_from(&self.schemas.pop().unwrap())? + } else { + self.schemas[1..] + .iter() + .try_fold(TableSchema::try_from(&self.schemas[0])?, |acc, schema| { + TableSchema::try_from(schema).map(|schema| merge_schema(acc, schema)) + })? + }; let mut names: Vec = vec![]; let mut types: Vec = vec![]; @@ -137,4 +179,4 @@ impl AccumulatingTransform for InferSchemaSeparator { ]); Ok(vec![block]) } -} \ No newline at end of file +} diff --git a/src/query/storages/stage/src/infer_schema.rs b/src/query/storages/stage/src/infer_schema.rs index d986e19ab9736..5a3c25a6c4910 100644 --- a/src/query/storages/stage/src/infer_schema.rs +++ b/src/query/storages/stage/src/infer_schema.rs @@ -13,12 +13,19 @@ // limitations under the License. use std::any::Any; -use std::hash::{DefaultHasher, Hash, Hasher}; +use std::hash::DefaultHasher; +use std::hash::Hash; +use std::hash::Hasher; use std::sync::Arc; -use databend_common_catalog::plan::{PartInfo, PartInfoPtr, PartInfoType}; + +use databend_common_catalog::plan::PartInfo; +use databend_common_catalog::plan::PartInfoPtr; +use databend_common_catalog::plan::PartInfoType; use databend_common_exception::ErrorCode; -use databend_common_meta_app::principal::{FileFormatParams, StageInfo}; -use databend_common_storage::{StageFileInfo, StageFilesInfo}; +use databend_common_meta_app::principal::FileFormatParams; +use databend_common_meta_app::principal::StageInfo; +use databend_common_storage::StageFileInfo; +use databend_common_storage::StageFilesInfo; #[derive(serde::Serialize, serde::Deserialize, PartialEq, Eq)] pub struct InferSchemaPartInfo { @@ -26,7 +33,6 @@ pub struct InferSchemaPartInfo { pub file_format_params: FileFormatParams, pub stage_info: StageInfo, pub stage_file_infos: Vec, - } #[typetag::serde(name = "infer_schema")] @@ -69,11 +75,13 @@ impl InferSchemaPartInfo { })) } - pub fn from_part(info: &PartInfoPtr) -> databend_common_exception::Result<&InferSchemaPartInfo> { + pub fn from_part( + info: &PartInfoPtr, + ) -> databend_common_exception::Result<&InferSchemaPartInfo> { info.as_any() .downcast_ref::() .ok_or_else(|| { ErrorCode::Internal("Cannot downcast from PartInfo to InferSchemaPartInfo.") }) } -} \ No newline at end of file +} diff --git a/src/query/storages/stage/src/lib.rs b/src/query/storages/stage/src/lib.rs index e52a1ab93215b..39e7392165464 100644 --- a/src/query/storages/stage/src/lib.rs +++ b/src/query/storages/stage/src/lib.rs @@ -21,20 +21,20 @@ mod append; mod compression; +mod infer_schema; mod read; mod stage_table; mod streaming_load; mod transform_generating; mod transform_null_if; -mod infer_schema; pub use append::StageSinkTable; pub use compression::get_compression_with_path; -pub use read::row_based::BytesBatch; -pub use stage_table::StageTable; -pub use streaming_load::build_streaming_load_pipeline; -pub use transform_null_if::TransformNullIf; pub use infer_schema::InferSchemaPartInfo; +pub use read::row_based::BytesBatch; pub use read::row_based::BytesReader; pub use read::row_based::Decompressor; pub use read::LoadContext; +pub use stage_table::StageTable; 
+pub use streaming_load::build_streaming_load_pipeline; +pub use transform_null_if::TransformNullIf; diff --git a/src/query/storages/stage/src/read/row_based/mod.rs b/src/query/storages/stage/src/read/row_based/mod.rs index 409236777b951..69e24b53db07a 100644 --- a/src/query/storages/stage/src/read/row_based/mod.rs +++ b/src/query/storages/stage/src/read/row_based/mod.rs @@ -20,6 +20,6 @@ mod read_pipeline; mod utils; pub use batch::BytesBatch; -pub use read_pipeline::RowBasedReadPipelineBuilder; pub use processors::BytesReader; pub use processors::Decompressor; +pub use read_pipeline::RowBasedReadPipelineBuilder; diff --git a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test index 2629bd5a0f351..558617d91adc0 100644 --- a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test +++ b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test @@ -61,11 +61,11 @@ drop CONNECTION IF EXISTS my_conn statement ok create CONNECTION my_conn STORAGE_TYPE = 's3' access_key_id='minioadmin' secret_access_key='minioadmin' endpoint_url='http://127.0.0.1:9900/' region='auto' -# query -# select * from INFER_SCHEMA(location => 's3://testbucket/data/parquet/tuple.parquet', connection_name => 'my_conn') -# ---- -# id INT 0 0 -# t TUPLE(A INT32, B STRING) 0 1 +query +select * from INFER_SCHEMA(location => 's3://testbucket/data/parquet/tuple.parquet', connection_name => 'my_conn') +---- +id INT 0 0 +t TUPLE(A INT32, B STRING) 0 1 # CSV statement ok From 4bd26e5476439065abe4ac1e972457a3539729cb Mon Sep 17 00:00:00 2001 From: kould Date: Thu, 4 Sep 2025 23:24:29 +0800 Subject: [PATCH 16/20] chore: codefmt --- src/query/service/src/table_functions/infer_schema/merge.rs | 2 +- .../service/src/table_functions/infer_schema/separator.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/query/service/src/table_functions/infer_schema/merge.rs b/src/query/service/src/table_functions/infer_schema/merge.rs index 1b441f6a68b24..5aa78e263a7cb 100644 --- a/src/query/service/src/table_functions/infer_schema/merge.rs +++ b/src/query/service/src/table_functions/infer_schema/merge.rs @@ -74,7 +74,7 @@ pub fn promote_numeric( _ => None, }; match (idx_a, idx_b) { - (Some(i), Some(j)) => Some(TableDataType::Number(types[usize::max(i, j)].clone())), + (Some(i), Some(j)) => Some(TableDataType::Number(types[usize::max(i, j)])), _ => None, } } diff --git a/src/query/service/src/table_functions/infer_schema/separator.rs b/src/query/service/src/table_functions/infer_schema/separator.rs index 68335bf4a930c..a485d72356003 100644 --- a/src/query/service/src/table_functions/infer_schema/separator.rs +++ b/src/query/service/src/table_functions/infer_schema/separator.rs @@ -73,7 +73,7 @@ impl AccumulatingTransform for InferSchemaSeparator { .and_then(BytesBatch::downcast_from) .unwrap(); - let bytes = self.files.entry(batch.path.clone()).or_insert(Vec::new()); + let bytes = self.files.entry(batch.path.clone()).or_default(); bytes.extend(batch.data); // When max_records exists, it will try to use the current bytes to read, otherwise it will buffer all bytes @@ -144,7 +144,7 @@ impl AccumulatingTransform for InferSchemaSeparator { return Ok(vec![DataBlock::empty()]); } self.is_finished = true; - if self.schemas.len() == 0 { + if self.schemas.is_empty() { return Ok(vec![DataBlock::empty()]); } let table_schema = if self.schemas.len() == 1 { From fb7fd0e74d000cf69a5abe2493c69b1d33ff695a Mon Sep 17 00:00:00 2001 From: 
kould Date: Tue, 9 Sep 2025 12:10:34 +0800 Subject: [PATCH 17/20] feat: impl `max_file_count` for `infer_schema` --- .../infer_schema/infer_schema_table.rs | 4 ++- .../table_functions/infer_schema/separator.rs | 29 +++++++++---------- .../infer_schema/table_args.rs | 6 ++++ tests/data/csv/max_file_count/numbers0.csv | 5 ++++ tests/data/csv/max_file_count/numbers1.csv | 4 +++ tests/data/csv/max_file_count/numbers2.csv | 4 +++ .../stage/formats/parquet/infer_schema.test | 9 ++++++ 7 files changed, 44 insertions(+), 17 deletions(-) create mode 100644 tests/data/csv/max_file_count/numbers0.csv create mode 100644 tests/data/csv/max_file_count/numbers1.csv create mode 100644 tests/data/csv/max_file_count/numbers2.csv diff --git a/src/query/service/src/table_functions/infer_schema/infer_schema_table.rs b/src/query/service/src/table_functions/infer_schema/infer_schema_table.rs index fd4ae9c947fcd..5e1ad8ac61b55 100644 --- a/src/query/service/src/table_functions/infer_schema/infer_schema_table.rs +++ b/src/query/service/src/table_functions/infer_schema/infer_schema_table.rs @@ -189,7 +189,9 @@ impl Table for InferSchemaTable { None => stage_info.file_format_params.clone(), }; let operator = init_stage_operator(&stage_info)?; - let stage_file_infos = files_info.list(&operator, 1, None).await?; + let stage_file_infos = files_info + .list(&operator, 1, self.args_parsed.max_file_count) + .await?; Ok(( PartStatistics::default(), Partitions::create(PartitionsShuffleKind::Seq, vec![ diff --git a/src/query/service/src/table_functions/infer_schema/separator.rs b/src/query/service/src/table_functions/infer_schema/separator.rs index a485d72356003..35bdbe76f3259 100644 --- a/src/query/service/src/table_functions/infer_schema/separator.rs +++ b/src/query/service/src/table_functions/infer_schema/separator.rs @@ -39,8 +39,8 @@ pub struct InferSchemaSeparator { pub file_format_params: FileFormatParams, files: HashMap>, pub max_records: Option, - schemas: Vec, - files_len: usize, + schemas: Option, + remaining_files_len: usize, is_finished: bool, } @@ -54,8 +54,8 @@ impl InferSchemaSeparator { file_format_params, files: HashMap::new(), max_records, - schemas: Vec::with_capacity(files_len), - files_len, + schemas: None, + remaining_files_len: files_len, is_finished: false, } } @@ -138,23 +138,20 @@ impl AccumulatingTransform for InferSchemaSeparator { } }; self.files.remove(&batch.path); - self.schemas.push(arrow_schema); - if self.schemas.len() != self.files_len { + let merge_schema = match self.schemas.take() { + None => TableSchema::try_from(&arrow_schema)?, + Some(schema) => merge_schema(schema, TableSchema::try_from(&arrow_schema)?), + }; + self.schemas = Some(merge_schema); + + self.remaining_files_len = self.remaining_files_len.checked_sub(1).unwrap_or(0); + if self.remaining_files_len > 0 { return Ok(vec![DataBlock::empty()]); } self.is_finished = true; - if self.schemas.is_empty() { + let Some(table_schema) = self.schemas.take() else { return Ok(vec![DataBlock::empty()]); - } - let table_schema = if self.schemas.len() == 1 { - TableSchema::try_from(&self.schemas.pop().unwrap())? - } else { - self.schemas[1..] - .iter() - .try_fold(TableSchema::try_from(&self.schemas[0])?, |acc, schema| { - TableSchema::try_from(schema).map(|schema| merge_schema(acc, schema)) - })? 
}; let mut names: Vec = vec![]; diff --git a/src/query/service/src/table_functions/infer_schema/table_args.rs b/src/query/service/src/table_functions/infer_schema/table_args.rs index 4bbf0ef113713..9781bc742ee4b 100644 --- a/src/query/service/src/table_functions/infer_schema/table_args.rs +++ b/src/query/service/src/table_functions/infer_schema/table_args.rs @@ -26,6 +26,7 @@ pub(crate) struct InferSchemaArgsParsed { pub(crate) file_format: Option, pub(crate) files_info: StageFilesInfo, pub(crate) max_records: Option, + pub(crate) max_file_count: Option, } impl InferSchemaArgsParsed { @@ -41,6 +42,7 @@ impl InferSchemaArgsParsed { pattern: None, }; let mut max_records = None; + let mut max_file_count = None; for (k, v) in &args { match k.to_lowercase().as_str() { @@ -59,6 +61,9 @@ impl InferSchemaArgsParsed { "max_records_pre_file" => { max_records = Some(i64_value(v)? as usize); } + "max_file_count" => { + max_file_count = Some(i64_value(v)? as usize); + } _ => { return Err(ErrorCode::BadArguments(format!( "unknown param {} for infer_schema", @@ -77,6 +82,7 @@ impl InferSchemaArgsParsed { file_format, files_info, max_records, + max_file_count, }) } } diff --git a/tests/data/csv/max_file_count/numbers0.csv b/tests/data/csv/max_file_count/numbers0.csv new file mode 100644 index 0000000000000..d0abce6450294 --- /dev/null +++ b/tests/data/csv/max_file_count/numbers0.csv @@ -0,0 +1,5 @@ +col1,col2,col3,col4,col5 +0,1,2,3,4 +5,6,7,8,9 +10,11,12,13,14 +a,b,c,d,e \ No newline at end of file diff --git a/tests/data/csv/max_file_count/numbers1.csv b/tests/data/csv/max_file_count/numbers1.csv new file mode 100644 index 0000000000000..a49bbf89b1d3d --- /dev/null +++ b/tests/data/csv/max_file_count/numbers1.csv @@ -0,0 +1,4 @@ +col1,col2,col3,col4,col5 +0,1,2,3,4 +5,6,7,8,9 +10,11,12,13,14 \ No newline at end of file diff --git a/tests/data/csv/max_file_count/numbers2.csv b/tests/data/csv/max_file_count/numbers2.csv new file mode 100644 index 0000000000000..a49bbf89b1d3d --- /dev/null +++ b/tests/data/csv/max_file_count/numbers2.csv @@ -0,0 +1,4 @@ +col1,col2,col3,col4,col5 +0,1,2,3,4 +5,6,7,8,9 +10,11,12,13,14 \ No newline at end of file diff --git a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test index 558617d91adc0..a29842ce01cfe 100644 --- a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test +++ b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test @@ -144,6 +144,15 @@ col3 VARCHAR 1 2 col4 VARCHAR 1 3 col5 VARCHAR 1 4 +query TTBI +select * from infer_schema(location => '@data/csv/max_file_count/', file_format => 'head_csv_format', max_file_count => 2); +---- +col1 BIGINT 1 0 +col2 BIGINT 1 1 +col3 BIGINT 1 2 +col4 BIGINT 1 3 +col5 BIGINT 1 4 + # NDJSON query TTBI select * from infer_schema(location => '@data/ndjson/numbers.ndjson', file_format => 'NDJSON'); From b26101eddaf32baa9e18a7249fff61e4ca041c23 Mon Sep 17 00:00:00 2001 From: kould Date: Tue, 9 Sep 2025 14:03:39 +0800 Subject: [PATCH 18/20] chore: codefmt --- src/query/service/src/table_functions/infer_schema/separator.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/query/service/src/table_functions/infer_schema/separator.rs b/src/query/service/src/table_functions/infer_schema/separator.rs index 35bdbe76f3259..c55b0429af645 100644 --- a/src/query/service/src/table_functions/infer_schema/separator.rs +++ b/src/query/service/src/table_functions/infer_schema/separator.rs @@ -145,7 +145,7 @@ impl 
AccumulatingTransform for InferSchemaSeparator { }; self.schemas = Some(merge_schema); - self.remaining_files_len = self.remaining_files_len.checked_sub(1).unwrap_or(0); + self.remaining_files_len = self.remaining_files_len.saturating_sub(1); if self.remaining_files_len > 0 { return Ok(vec![DataBlock::empty()]); } From fc6ce4be94a80b21cb7dc1a86a4210fc75c3cf71 Mon Sep 17 00:00:00 2001 From: kould Date: Tue, 9 Sep 2025 18:46:17 +0800 Subject: [PATCH 19/20] feat: impl `max_file_count` for `infer_schema` --- src/common/storage/src/stage.rs | 2 +- src/meta/app/src/principal/file_format.rs | 24 +- src/meta/app/src/principal/user_stage.rs | 10 +- src/query/ast/src/ast/statements/copy.rs | 4 +- .../infer_schema/infer_schema_table.rs | 1 + .../table_functions/infer_schema/parquet.rs | 9 + .../table_functions/infer_schema/separator.rs | 42 ++- src/query/storages/stage/src/infer_schema.rs | 9 +- tests/data/csv/max_file_count/numbers0.csv | 3 +- .../ndjson/max_file_count/numbers0.ndjson | 3 + .../ndjson/max_file_count/numbers1.ndjson | 3 + .../ndjson/max_file_count/numbers2.ndjson | 3 + .../parquet/max_file_count/tuple0.parquet | Bin 0 -> 2029 bytes .../parquet/max_file_count/tuple1.parquet | Bin 0 -> 2029 bytes .../parquet/max_file_count/tuple2.parquet | Bin 0 -> 2029 bytes .../stage/formats/parquet/infer_schema.test | 297 ++++++++++-------- 16 files changed, 240 insertions(+), 170 deletions(-) create mode 100644 tests/data/ndjson/max_file_count/numbers0.ndjson create mode 100644 tests/data/ndjson/max_file_count/numbers1.ndjson create mode 100644 tests/data/ndjson/max_file_count/numbers2.ndjson create mode 100644 tests/data/parquet/max_file_count/tuple0.parquet create mode 100644 tests/data/parquet/max_file_count/tuple1.parquet create mode 100644 tests/data/parquet/max_file_count/tuple2.parquet diff --git a/src/common/storage/src/stage.rs b/src/common/storage/src/stage.rs index 6b863ff4e5252..4ce56be4e1f67 100644 --- a/src/common/storage/src/stage.rs +++ b/src/common/storage/src/stage.rs @@ -98,7 +98,7 @@ pub fn init_stage_operator(stage_info: &StageInfo) -> Result { } /// select * from @s1/ (FILES => PATTERN => ) /// copy from @s1/ FILES = PATTERN => -#[derive(serde::Serialize, serde::Deserialize, Clone, PartialEq, Eq, Debug, Hash)] +#[derive(serde::Serialize, serde::Deserialize, Clone, PartialEq, Eq, Debug)] pub struct StageFilesInfo { pub path: String, pub files: Option>, diff --git a/src/meta/app/src/principal/file_format.rs b/src/meta/app/src/principal/file_format.rs index 8fc90ce74c79e..19e829c44e2ee 100644 --- a/src/meta/app/src/principal/file_format.rs +++ b/src/meta/app/src/principal/file_format.rs @@ -52,7 +52,7 @@ const OPT_BINARY_FORMAT: &str = "binary_format"; const OPT_USE_LOGIC_TYPE: &str = "use_logic_type"; /// File format parameters after checking and parsing. 
-#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] #[serde(tag = "type")] pub enum FileFormatParams { Csv(CsvFileFormatParams), @@ -446,7 +446,7 @@ impl FileFormatOptionsReader { } } -#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct CsvFileFormatParams { pub compression: StageFileCompression, @@ -498,7 +498,7 @@ impl CsvFileFormatParams { } } -#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct TsvFileFormatParams { pub compression: StageFileCompression, pub headers: u64, @@ -532,7 +532,7 @@ impl TsvFileFormatParams { } } -#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct XmlFileFormatParams { pub compression: StageFileCompression, pub row_tag: String, @@ -558,7 +558,7 @@ impl Default for XmlFileFormatParams { /// used for both `missing_field_as` and `null_field_as` /// for extensibility, it is stored as PB string in meta -#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize, Default)] +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, Default)] pub enum NullAs { /// for `missing_field_as` only, and is default for it for safety, /// in case of wrong field names when creating table. @@ -570,7 +570,7 @@ pub enum NullAs { FieldDefault, } -#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize, Default)] +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, Default)] pub enum EmptyFieldAs { #[default] Null, @@ -638,7 +638,7 @@ impl Display for NullAs { } } -#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize, Default)] +#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize, Default)] pub enum BinaryFormat { #[default] Hex, @@ -668,7 +668,7 @@ impl Display for BinaryFormat { } } -#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct JsonFileFormatParams { pub compression: StageFileCompression, } @@ -690,7 +690,7 @@ impl Default for JsonFileFormatParams { } } -#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct NdJsonFileFormatParams { pub compression: StageFileCompression, pub missing_field_as: NullAs, @@ -741,7 +741,7 @@ impl NdJsonFileFormatParams { } } -#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct AvroFileFormatParams { pub compression: StageFileCompression, pub missing_field_as: NullAs, @@ -791,7 +791,7 @@ impl AvroFileFormatParams { } } -#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct ParquetFileFormatParams { // used only for unload pub compression: StageFileCompression, @@ -828,7 +828,7 @@ impl ParquetFileFormatParams { } } -#[derive(Clone, Debug, Default, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)] pub struct OrcFileFormatParams { pub missing_field_as: NullAs, } diff --git a/src/meta/app/src/principal/user_stage.rs b/src/meta/app/src/principal/user_stage.rs index c2261b288c6d4..92da76b413c07 100644 --- 
a/src/meta/app/src/principal/user_stage.rs +++ b/src/meta/app/src/principal/user_stage.rs @@ -60,7 +60,7 @@ pub const COPY_MAX_FILES_PER_COMMIT: usize = 15000; /// Instruction for exceeding 'copy into table' file limit. pub const COPY_MAX_FILES_COMMIT_MSG: &str = "Commit limit reached: 15,000 files for 'copy into table'. To handle more files, adjust 'CopyOption' with 'max_files='(e.g., 'max_files=10000') and perform several operations until all files are processed."; -#[derive(serde::Serialize, serde::Deserialize, Clone, Debug, Eq, PartialEq, Hash)] +#[derive(serde::Serialize, serde::Deserialize, Clone, Debug, Eq, PartialEq)] pub enum StageType { /// LegacyInternal will be deprecated. /// @@ -96,7 +96,7 @@ impl Default for StageType { } } -#[derive(serde::Serialize, serde::Deserialize, Clone, Copy, Debug, Eq, PartialEq, Hash)] +#[derive(serde::Serialize, serde::Deserialize, Clone, Copy, Debug, Eq, PartialEq)] pub enum StageFileCompression { Auto, Gzip, @@ -396,13 +396,13 @@ impl Display for FileFormatOptions { } } -#[derive(serde::Serialize, serde::Deserialize, Default, Clone, Debug, Eq, PartialEq, Hash)] +#[derive(serde::Serialize, serde::Deserialize, Default, Clone, Debug, Eq, PartialEq)] #[serde(default)] pub struct StageParams { pub storage: StorageParams, } -#[derive(serde::Serialize, serde::Deserialize, Clone, Default, Debug, Eq, PartialEq, Hash)] +#[derive(serde::Serialize, serde::Deserialize, Clone, Default, Debug, Eq, PartialEq)] #[serde(default)] pub struct CopyOptions { pub on_error: OnErrorMode, @@ -419,7 +419,7 @@ pub struct CopyOptions { pub detailed_output: bool, } -#[derive(serde::Serialize, serde::Deserialize, Default, Clone, Debug, Eq, PartialEq, Hash)] +#[derive(serde::Serialize, serde::Deserialize, Default, Clone, Debug, Eq, PartialEq)] #[serde(default)] pub struct StageInfo { pub stage_name: String, diff --git a/src/query/ast/src/ast/statements/copy.rs b/src/query/ast/src/ast/statements/copy.rs index b207b05d7d879..8e10e37318270 100644 --- a/src/query/ast/src/ast/statements/copy.rs +++ b/src/query/ast/src/ast/statements/copy.rs @@ -648,9 +648,7 @@ impl Display for FileFormatValue { } } -#[derive( - serde::Serialize, serde::Deserialize, Debug, Clone, PartialEq, Drive, DriveMut, Eq, Hash, -)] +#[derive(serde::Serialize, serde::Deserialize, Debug, Clone, PartialEq, Drive, DriveMut, Eq)] pub enum OnErrorMode { Continue, SkipFileNum(u64), diff --git a/src/query/service/src/table_functions/infer_schema/infer_schema_table.rs b/src/query/service/src/table_functions/infer_schema/infer_schema_table.rs index 5e1ad8ac61b55..f946aead234e0 100644 --- a/src/query/service/src/table_functions/infer_schema/infer_schema_table.rs +++ b/src/query/service/src/table_functions/infer_schema/infer_schema_table.rs @@ -102,6 +102,7 @@ impl InferSchemaTable { TableField::new("column_name", TableDataType::String), TableField::new("type", TableDataType::String), TableField::new("nullable", TableDataType::Boolean), + TableField::new("filenames", TableDataType::String), TableField::new("order_id", TableDataType::Number(NumberDataType::UInt64)), ]) } diff --git a/src/query/service/src/table_functions/infer_schema/parquet.rs b/src/query/service/src/table_functions/infer_schema/parquet.rs index 38e15819d0eb7..5db9713b9ac69 100644 --- a/src/query/service/src/table_functions/infer_schema/parquet.rs +++ b/src/query/service/src/table_functions/infer_schema/parquet.rs @@ -32,6 +32,7 @@ use databend_common_storage::init_stage_operator; use databend_common_storage::read_parquet_schema_async_rs; use 
databend_common_storage::StageFileInfo; use futures_util::future::try_join_all; +use itertools::Itertools; use crate::table_functions::infer_schema::infer_schema_table::INFER_SCHEMA; @@ -79,6 +80,12 @@ impl AsyncSource for ParquetInferSchemaSource { let mut names: Vec = vec![]; let mut types: Vec = vec![]; let mut nulls: Vec = vec![]; + let mut filenames: Vec = vec![]; + let filenames_str = self + .stage_file_infos + .iter() + .map(|info| &info.path) + .join(", "); for field in table_schema.fields().iter() { names.push(field.name().to_string()); @@ -86,6 +93,7 @@ impl AsyncSource for ParquetInferSchemaSource { let non_null_type = field.data_type().remove_recursive_nullable(); types.push(non_null_type.sql_name()); nulls.push(field.is_nullable()); + filenames.push(filenames_str.clone()); } let order_ids = (0..table_schema.fields().len() as u64).collect::>(); @@ -94,6 +102,7 @@ impl AsyncSource for ParquetInferSchemaSource { StringType::from_data(names), StringType::from_data(types), BooleanType::from_data(nulls), + StringType::from_data(filenames), UInt64Type::from_data(order_ids), ]); Ok(Some(block)) diff --git a/src/query/service/src/table_functions/infer_schema/separator.rs b/src/query/service/src/table_functions/infer_schema/separator.rs index c55b0429af645..b5607f0a3b9f1 100644 --- a/src/query/service/src/table_functions/infer_schema/separator.rs +++ b/src/query/service/src/table_functions/infer_schema/separator.rs @@ -32,15 +32,19 @@ use databend_common_expression::TableSchema; use databend_common_meta_app::principal::FileFormatParams; use databend_common_pipeline_transforms::AccumulatingTransform; use databend_common_storages_stage::BytesBatch; +use itertools::Itertools; use crate::table_functions::infer_schema::merge::merge_schema; +const MAX_SINGLE_FILE_BYTES: usize = 100 * 1024 * 1024; + pub struct InferSchemaSeparator { pub file_format_params: FileFormatParams, files: HashMap>, pub max_records: Option, schemas: Option, - remaining_files_len: usize, + files_len: usize, + filenames: Vec, is_finished: bool, } @@ -55,7 +59,8 @@ impl InferSchemaSeparator { files: HashMap::new(), max_records, schemas: None, - remaining_files_len: files_len, + files_len, + filenames: Vec::with_capacity(files_len), is_finished: false, } } @@ -76,6 +81,14 @@ impl AccumulatingTransform for InferSchemaSeparator { let bytes = self.files.entry(batch.path.clone()).or_default(); bytes.extend(batch.data); + if bytes.len() > MAX_SINGLE_FILE_BYTES { + return Err(ErrorCode::InvalidArgument(format!( + "The file '{}' is too large(maximum allowed: {})", + batch.path, + human_readable_size(MAX_SINGLE_FILE_BYTES), + ))); + } + // When max_records exists, it will try to use the current bytes to read, otherwise it will buffer all bytes if self.max_records.is_none() && !batch.is_eof { return Ok(vec![DataBlock::empty()]); @@ -138,6 +151,7 @@ impl AccumulatingTransform for InferSchemaSeparator { } }; self.files.remove(&batch.path); + self.filenames.push(batch.path); let merge_schema = match self.schemas.take() { None => TableSchema::try_from(&arrow_schema)?, @@ -145,8 +159,7 @@ impl AccumulatingTransform for InferSchemaSeparator { }; self.schemas = Some(merge_schema); - self.remaining_files_len = self.remaining_files_len.saturating_sub(1); - if self.remaining_files_len > 0 { + if self.files_len > self.filenames.len() { return Ok(vec![DataBlock::empty()]); } self.is_finished = true; @@ -157,6 +170,8 @@ impl AccumulatingTransform for InferSchemaSeparator { let mut names: Vec = vec![]; let mut types: Vec = vec![]; let mut nulls: 
Vec = vec![]; + let mut filenames: Vec = vec![]; + let filenames_str = self.filenames.iter().join(", "); for field in table_schema.fields().iter() { names.push(field.name().to_string()); @@ -164,6 +179,7 @@ impl AccumulatingTransform for InferSchemaSeparator { let non_null_type = field.data_type().remove_recursive_nullable(); types.push(non_null_type.sql_name()); nulls.push(field.is_nullable()); + filenames.push(filenames_str.clone()); } let order_ids = (0..table_schema.fields().len() as u64).collect::>(); @@ -172,8 +188,26 @@ impl AccumulatingTransform for InferSchemaSeparator { StringType::from_data(names), StringType::from_data(types), BooleanType::from_data(nulls), + StringType::from_data(filenames), UInt64Type::from_data(order_ids), ]); Ok(vec![block]) } } + +fn human_readable_size(bytes: usize) -> String { + const KB: f64 = 1024.0; + const MB: f64 = KB * 1024.0; + const GB: f64 = MB * 1024.0; + + let b = bytes as f64; + if b >= GB { + format!("{:.2} GB", b / GB) + } else if b >= MB { + format!("{:.2} MB", b / MB) + } else if b >= KB { + format!("{:.2} KB", b / KB) + } else { + format!("{} B", bytes) + } +} diff --git a/src/query/storages/stage/src/infer_schema.rs b/src/query/storages/stage/src/infer_schema.rs index 5a3c25a6c4910..77a961594992e 100644 --- a/src/query/storages/stage/src/infer_schema.rs +++ b/src/query/storages/stage/src/infer_schema.rs @@ -13,9 +13,6 @@ // limitations under the License. use std::any::Any; -use std::hash::DefaultHasher; -use std::hash::Hash; -use std::hash::Hasher; use std::sync::Arc; use databend_common_catalog::plan::PartInfo; @@ -48,11 +45,7 @@ impl PartInfo for InferSchemaPartInfo { } fn hash(&self) -> u64 { - let mut s = DefaultHasher::new(); - self.files_info.hash(&mut s); - self.file_format_params.hash(&mut s); - self.stage_info.hash(&mut s); - s.finish() + 0 } fn part_type(&self) -> PartInfoType { diff --git a/tests/data/csv/max_file_count/numbers0.csv b/tests/data/csv/max_file_count/numbers0.csv index d0abce6450294..a49bbf89b1d3d 100644 --- a/tests/data/csv/max_file_count/numbers0.csv +++ b/tests/data/csv/max_file_count/numbers0.csv @@ -1,5 +1,4 @@ col1,col2,col3,col4,col5 0,1,2,3,4 5,6,7,8,9 -10,11,12,13,14 -a,b,c,d,e \ No newline at end of file +10,11,12,13,14 \ No newline at end of file diff --git a/tests/data/ndjson/max_file_count/numbers0.ndjson b/tests/data/ndjson/max_file_count/numbers0.ndjson new file mode 100644 index 0000000000000..aecddc3762d07 --- /dev/null +++ b/tests/data/ndjson/max_file_count/numbers0.ndjson @@ -0,0 +1,3 @@ +{"id": 1, "value": 100} +{"id": 2, "value": 200} +{"id": 3, "value": 300} diff --git a/tests/data/ndjson/max_file_count/numbers1.ndjson b/tests/data/ndjson/max_file_count/numbers1.ndjson new file mode 100644 index 0000000000000..aecddc3762d07 --- /dev/null +++ b/tests/data/ndjson/max_file_count/numbers1.ndjson @@ -0,0 +1,3 @@ +{"id": 1, "value": 100} +{"id": 2, "value": 200} +{"id": 3, "value": 300} diff --git a/tests/data/ndjson/max_file_count/numbers2.ndjson b/tests/data/ndjson/max_file_count/numbers2.ndjson new file mode 100644 index 0000000000000..aecddc3762d07 --- /dev/null +++ b/tests/data/ndjson/max_file_count/numbers2.ndjson @@ -0,0 +1,3 @@ +{"id": 1, "value": 100} +{"id": 2, "value": 200} +{"id": 3, "value": 300} diff --git a/tests/data/parquet/max_file_count/tuple0.parquet b/tests/data/parquet/max_file_count/tuple0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..53ccb995f5badcb5bb0be445de959d8102bf44e4 GIT binary patch literal 2029 
zcmb_d-EQJW6dnU>p$e%{wMIrl6|%Z07i82Deq@zkrEtTrdF= zt<(qTWz~yb^s<+|=mYd2D)lX@Y9FEJj3FWEk6LxDcs%~TGw1u}%rQBpoq`Y(L?PW4 zc!9h}NFv7(LLzWS;TDJx7ZN$(w&3Q|Vhm9+hA~O9z3~P>ED?!*K(aq@X(5s-6Mict zMr>1vB>CjGT=IJ^`$;zbPZ?4-+(sCz#nULevc(JU{7YNWI2Yx! zzeI0cZF2!fnXRw6B0n{Tma7RdsMr&Nl_r34!5ZKw5fVUf z@)OcdgL0BjrBg57=JCAcqGta`QIg+9lYBOw{3-gFkW^0Oea$nqsrYN;MzTHATAUkW zXXbiS>3Qa}FLD_vd_8BJ^#EnBHe5lgW6F%2(E8YL7ic3#Fa>PEi7P(n@@TEbbI`O44g0CTOSgQh6RWq4*pB zAzr0Hl~Q^FH(sAoTB7oR)+rs(>VRRge2Edj@9U1 zH%H@vQpiu6)z(NaE(*F*4zb;-t-JL=J#Fp*=d0Bw#<-f&ovQW9tgjRtTnl=aO&fB1 z@(^+kjY1`F_2e}-GxradjCacgy(gD@yPdMpI0H}A)|7JoXmnz?tJKz&ZoaQ{hGwIN zerKk#m|lQqYuUQf^1@ivbsW9Xx!CZR|9cM=TXVZ3GxQI63v;Wx>G2*PsJ+fmZPfF9 z+;;;!Hx`vu?Ut%i&+EHQ^apib_;8c`w#Ght5Bm?t1041ZY|Y6r42(WwJX?R;st|@~ZQWWYq2AG}wO*AqrOcLw{R!b= z7h+3Oy130>V@(P{XRCfa!2X46l@zCRm2bm8gHd1hRScjM8hU_RWTqH>%Eph0nDMpA jmoqDPYWV(B_7P`RA8981d{ZyrCqMin1)h*&_(%T-pz+dT literal 0 HcmV?d00001 diff --git a/tests/data/parquet/max_file_count/tuple1.parquet b/tests/data/parquet/max_file_count/tuple1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..53ccb995f5badcb5bb0be445de959d8102bf44e4 GIT binary patch literal 2029 zcmb_d-EQJW6dnU>p$e%{wMIrl6|%Z07i82Deq@zkrEtTrdF= zt<(qTWz~yb^s<+|=mYd2D)lX@Y9FEJj3FWEk6LxDcs%~TGw1u}%rQBpoq`Y(L?PW4 zc!9h}NFv7(LLzWS;TDJx7ZN$(w&3Q|Vhm9+hA~O9z3~P>ED?!*K(aq@X(5s-6Mict zMr>1vB>CjGT=IJ^`$;zbPZ?4-+(sCz#nULevc(JU{7YNWI2Yx! zzeI0cZF2!fnXRw6B0n{Tma7RdsMr&Nl_r34!5ZKw5fVUf z@)OcdgL0BjrBg57=JCAcqGta`QIg+9lYBOw{3-gFkW^0Oea$nqsrYN;MzTHATAUkW zXXbiS>3Qa}FLD_vd_8BJ^#EnBHe5lgW6F%2(E8YL7ic3#Fa>PEi7P(n@@TEbbI`O44g0CTOSgQh6RWq4*pB zAzr0Hl~Q^FH(sAoTB7oR)+rs(>VRRge2Edj@9U1 zH%H@vQpiu6)z(NaE(*F*4zb;-t-JL=J#Fp*=d0Bw#<-f&ovQW9tgjRtTnl=aO&fB1 z@(^+kjY1`F_2e}-GxradjCacgy(gD@yPdMpI0H}A)|7JoXmnz?tJKz&ZoaQ{hGwIN zerKk#m|lQqYuUQf^1@ivbsW9Xx!CZR|9cM=TXVZ3GxQI63v;Wx>G2*PsJ+fmZPfF9 z+;;;!Hx`vu?Ut%i&+EHQ^apib_;8c`w#Ght5Bm?t1041ZY|Y6r42(WwJX?R;st|@~ZQWWYq2AG}wO*AqrOcLw{R!b= z7h+3Oy130>V@(P{XRCfa!2X46l@zCRm2bm8gHd1hRScjM8hU_RWTqH>%Eph0nDMpA jmoqDPYWV(B_7P`RA8981d{ZyrCqMin1)h*&_(%T-pz+dT literal 0 HcmV?d00001 diff --git a/tests/data/parquet/max_file_count/tuple2.parquet b/tests/data/parquet/max_file_count/tuple2.parquet new file mode 100644 index 0000000000000000000000000000000000000000..53ccb995f5badcb5bb0be445de959d8102bf44e4 GIT binary patch literal 2029 zcmb_d-EQJW6dnU>p$e%{wMIrl6|%Z07i82Deq@zkrEtTrdF= zt<(qTWz~yb^s<+|=mYd2D)lX@Y9FEJj3FWEk6LxDcs%~TGw1u}%rQBpoq`Y(L?PW4 zc!9h}NFv7(LLzWS;TDJx7ZN$(w&3Q|Vhm9+hA~O9z3~P>ED?!*K(aq@X(5s-6Mict zMr>1vB>CjGT=IJ^`$;zbPZ?4-+(sCz#nULevc(JU{7YNWI2Yx! 
zzeI0cZF2!fnXRw6B0n{Tma7RdsMr&Nl_r34!5ZKw5fVUf z@)OcdgL0BjrBg57=JCAcqGta`QIg+9lYBOw{3-gFkW^0Oea$nqsrYN;MzTHATAUkW zXXbiS>3Qa}FLD_vd_8BJ^#EnBHe5lgW6F%2(E8YL7ic3#Fa>PEi7P(n@@TEbbI`O44g0CTOSgQh6RWq4*pB zAzr0Hl~Q^FH(sAoTB7oR)+rs(>VRRge2Edj@9U1 zH%H@vQpiu6)z(NaE(*F*4zb;-t-JL=J#Fp*=d0Bw#<-f&ovQW9tgjRtTnl=aO&fB1 z@(^+kjY1`F_2e}-GxradjCacgy(gD@yPdMpI0H}A)|7JoXmnz?tJKz&ZoaQ{hGwIN zerKk#m|lQqYuUQf^1@ivbsW9Xx!CZR|9cM=TXVZ3GxQI63v;Wx>G2*PsJ+fmZPfF9 z+;;;!Hx`vu?Ut%i&+EHQ^apib_;8c`w#Ght5Bm?t1041ZY|Y6r42(WwJX?R;st|@~ZQWWYq2AG}wO*AqrOcLw{R!b= z7h+3Oy130>V@(P{XRCfa!2X46l@zCRm2bm8gHd1hRScjM8hU_RWTqH>%Eph0nDMpA jmoqDPYWV(B_7P`RA8981d{ZyrCqMin1)h*&_(%T-pz+dT literal 0 HcmV?d00001 diff --git a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test index a29842ce01cfe..cf67ad1ea49ca 100644 --- a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test +++ b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test @@ -5,55 +5,55 @@ select * from infer_schema(location => '@data/invalid_xxx/tuple.parquet') query select * from infer_schema(location => '@data/parquet/tuple.parquet') ---- -id INT 0 0 -t TUPLE(A INT32, B STRING) 0 1 +id INT 0 parquet/tuple.parquet 0 +t TUPLE(A INT32, B STRING) 0 parquet/tuple.parquet 1 query select * from infer_schema(location => '@data/parquet/complex.parquet') ---- -resourceType VARCHAR 1 0 -id VARCHAR 1 1 -meta TUPLE(ID STRING, EXTENSION ARRAY(STRING), VERSIONID STRING, LASTUPDATED TIMESTAMP, SOURCE STRING, PROFILE ARRAY(STRING), SECURITY ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VERSION STRING, CODE STRING, DISPLAY STRING, USERSELECTED BOOLEAN)), TAG ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VERSION STRING, CODE STRING, DISPLAY STRING, USERSELECTED BOOLEAN))) 1 2 -implicitRules VARCHAR 1 3 -language VARCHAR 1 4 -text TUPLE(ID STRING, EXTENSION ARRAY(STRING), STATUS STRING, DIV STRING) 1 5 -contained ARRAY(STRING) 1 6 -extension ARRAY(STRING) 1 7 -modifierExtension ARRAY(STRING) 1 8 -identifier ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), USE STRING, TYPE TUPLE(ID STRING, EXTENSION ARRAY(STRING), CODING ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VERSION STRING, CODE STRING, DISPLAY STRING, USERSELECTED BOOLEAN)), TEXT STRING), SYSTEM STRING, VALUE STRING, PERIOD TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP), ASSIGNER TUPLE(ID STRING, EXTENSION ARRAY(STRING), REFERENCE STRING, TYPE STRING, IDENTIFIER TUPLE(ID STRING, EXTENSION ARRAY(STRING), USE STRING, TYPE TUPLE(ID STRING, EXTENSION ARRAY(STRING), CODING ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VERSION STRING, CODE STRING, DISPLAY STRING, USERSELECTED BOOLEAN)), TEXT STRING), SYSTEM STRING, VALUE STRING, PERIOD TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP), ASSIGNER STRING), DISPLAY STRING))) 1 9 -active BOOLEAN 1 10 -name ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), USE STRING, TEXT STRING, FAMILY STRING, GIVEN ARRAY(STRING), PREFIX ARRAY(STRING), SUFFIX ARRAY(STRING), PERIOD TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP))) 1 11 -telecom ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VALUE STRING, USE STRING, RANK INT32, PERIOD TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP))) 1 12 -gender VARCHAR 1 13 -birthDate DATE 1 14 -deceasedBoolean BOOLEAN 1 15 -deceasedDateTime TIMESTAMP 1 16 -address ARRAY(TUPLE(ID STRING, 
EXTENSION ARRAY(STRING), USE STRING, TYPE STRING, TEXT STRING, LINE ARRAY(STRING), CITY STRING, DISTRICT STRING, STATE STRING, POSTALCODE STRING, COUNTRY STRING, PERIOD TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP))) 1 17 -maritalStatus TUPLE(ID STRING, EXTENSION ARRAY(STRING), CODING ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VERSION STRING, CODE STRING, DISPLAY STRING, USERSELECTED BOOLEAN)), TEXT STRING) 1 18 -multipleBirthBoolean BOOLEAN 1 19 -multipleBirthInteger INT 1 20 -photo ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), CONTENTTYPE STRING, LANGUAGE STRING, DATA BINARY, URL STRING, SIZE INT32, HASH BINARY, TITLE STRING, CREATION TIMESTAMP)) 1 21 -contact ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), MODIFIEREXTENSION ARRAY(STRING), RELATIONSHIP ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), CODING ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VERSION STRING, CODE STRING, DISPLAY STRING, USERSELECTED BOOLEAN)), TEXT STRING)), NAME TUPLE(ID STRING, EXTENSION ARRAY(STRING), USE STRING, TEXT STRING, FAMILY STRING, GIVEN ARRAY(STRING), PREFIX ARRAY(STRING), SUFFIX ARRAY(STRING), PERIOD TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP)), TELECOM ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VALUE STRING, USE STRING, RANK INT32, PERIOD TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP))), ADDRESS TUPLE(ID STRING, EXTENSION ARRAY(STRING), USE STRING, TYPE STRING, TEXT STRING, LINE ARRAY(STRING), CITY STRING, DISTRICT STRING, STATE STRING, POSTALCODE STRING, COUNTRY STRING, PERIOD TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP)), GENDER STRING, ORGANIZATION TUPLE(ID STRING, EXTENSION ARRAY(STRING), REFERENCE STRING, TYPE STRING, IDENTIFIER TUPLE(ID STRING, EXTENSION ARRAY(STRING), USE STRING, TYPE TUPLE(ID STRING, EXTENSION ARRAY(STRING), CODING ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VERSION STRING, CODE STRING, DISPLAY STRING, USERSELECTED BOOLEAN)), TEXT STRING), SYSTEM STRING, VALUE STRING, PERIOD TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP), ASSIGNER STRING), DISPLAY STRING), PERIOD TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP))) 1 22 -communication ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), MODIFIEREXTENSION ARRAY(STRING), LANGUAGE TUPLE(ID STRING, EXTENSION ARRAY(STRING), CODING ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VERSION STRING, CODE STRING, DISPLAY STRING, USERSELECTED BOOLEAN)), TEXT STRING), PREFERRED BOOLEAN)) 1 23 -generalPractitioner ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), REFERENCE STRING, TYPE STRING, IDENTIFIER TUPLE(ID STRING, EXTENSION ARRAY(STRING), USE STRING, TYPE TUPLE(ID STRING, EXTENSION ARRAY(STRING), CODING ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VERSION STRING, CODE STRING, DISPLAY STRING, USERSELECTED BOOLEAN)), TEXT STRING), SYSTEM STRING, VALUE STRING, PERIOD TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP), ASSIGNER STRING), DISPLAY STRING)) 1 24 -managingOrganization TUPLE(ID STRING, EXTENSION ARRAY(STRING), REFERENCE STRING, TYPE STRING, IDENTIFIER TUPLE(ID STRING, EXTENSION ARRAY(STRING), USE STRING, TYPE TUPLE(ID STRING, EXTENSION ARRAY(STRING), CODING ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VERSION STRING, CODE STRING, DISPLAY STRING, USERSELECTED BOOLEAN)), TEXT STRING), SYSTEM STRING, VALUE STRING, PERIOD 
TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP), ASSIGNER STRING), DISPLAY STRING) 1 25
-link ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), MODIFIEREXTENSION ARRAY(STRING), OTHER TUPLE(ID STRING, EXTENSION ARRAY(STRING), REFERENCE STRING, TYPE STRING, IDENTIFIER TUPLE(ID STRING, EXTENSION ARRAY(STRING), USE STRING, TYPE TUPLE(ID STRING, EXTENSION ARRAY(STRING), CODING ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VERSION STRING, CODE STRING, DISPLAY STRING, USERSELECTED BOOLEAN)), TEXT STRING), SYSTEM STRING, VALUE STRING, PERIOD TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP), ASSIGNER STRING), DISPLAY STRING), TYPE STRING)) 1 26
-yy__version INT 1 27
-yy__us_core_race VARCHAR 1 28
-yy__us_core_ethnicity VARCHAR 1 29
-yy__us_core_birthsex TUPLE(VALUECODE STRING,) 1 30
+resourceType VARCHAR 1 parquet/complex.parquet 0
+id VARCHAR 1 parquet/complex.parquet 1
+meta TUPLE(ID STRING, EXTENSION ARRAY(STRING), VERSIONID STRING, LASTUPDATED TIMESTAMP, SOURCE STRING, PROFILE ARRAY(STRING), SECURITY ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VERSION STRING, CODE STRING, DISPLAY STRING, USERSELECTED BOOLEAN)), TAG ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VERSION STRING, CODE STRING, DISPLAY STRING, USERSELECTED BOOLEAN))) 1 parquet/complex.parquet 2
+implicitRules VARCHAR 1 parquet/complex.parquet 3
+language VARCHAR 1 parquet/complex.parquet 4
+text TUPLE(ID STRING, EXTENSION ARRAY(STRING), STATUS STRING, DIV STRING) 1 parquet/complex.parquet 5
+contained ARRAY(STRING) 1 parquet/complex.parquet 6
+extension ARRAY(STRING) 1 parquet/complex.parquet 7
+modifierExtension ARRAY(STRING) 1 parquet/complex.parquet 8
+identifier ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), USE STRING, TYPE TUPLE(ID STRING, EXTENSION ARRAY(STRING), CODING ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VERSION STRING, CODE STRING, DISPLAY STRING, USERSELECTED BOOLEAN)), TEXT STRING), SYSTEM STRING, VALUE STRING, PERIOD TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP), ASSIGNER TUPLE(ID STRING, EXTENSION ARRAY(STRING), REFERENCE STRING, TYPE STRING, IDENTIFIER TUPLE(ID STRING, EXTENSION ARRAY(STRING), USE STRING, TYPE TUPLE(ID STRING, EXTENSION ARRAY(STRING), CODING ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VERSION STRING, CODE STRING, DISPLAY STRING, USERSELECTED BOOLEAN)), TEXT STRING), SYSTEM STRING, VALUE STRING, PERIOD TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP), ASSIGNER STRING), DISPLAY STRING))) 1 parquet/complex.parquet 9
+active BOOLEAN 1 parquet/complex.parquet 10
+name ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), USE STRING, TEXT STRING, FAMILY STRING, GIVEN ARRAY(STRING), PREFIX ARRAY(STRING), SUFFIX ARRAY(STRING), PERIOD TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP))) 1 parquet/complex.parquet 11
+telecom ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VALUE STRING, USE STRING, RANK INT32, PERIOD TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP))) 1 parquet/complex.parquet 12
+gender VARCHAR 1 parquet/complex.parquet 13
+birthDate DATE 1 parquet/complex.parquet 14
+deceasedBoolean BOOLEAN 1 parquet/complex.parquet 15
+deceasedDateTime TIMESTAMP 1 parquet/complex.parquet 16
+address ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), USE STRING, TYPE STRING, TEXT STRING, LINE ARRAY(STRING), CITY STRING, DISTRICT STRING, STATE STRING, POSTALCODE STRING, COUNTRY STRING, PERIOD TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP))) 1 parquet/complex.parquet 17
+maritalStatus TUPLE(ID STRING, EXTENSION ARRAY(STRING), CODING ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VERSION STRING, CODE STRING, DISPLAY STRING, USERSELECTED BOOLEAN)), TEXT STRING) 1 parquet/complex.parquet 18
+multipleBirthBoolean BOOLEAN 1 parquet/complex.parquet 19
+multipleBirthInteger INT 1 parquet/complex.parquet 20
+photo ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), CONTENTTYPE STRING, LANGUAGE STRING, DATA BINARY, URL STRING, SIZE INT32, HASH BINARY, TITLE STRING, CREATION TIMESTAMP)) 1 parquet/complex.parquet 21
+contact ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), MODIFIEREXTENSION ARRAY(STRING), RELATIONSHIP ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), CODING ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VERSION STRING, CODE STRING, DISPLAY STRING, USERSELECTED BOOLEAN)), TEXT STRING)), NAME TUPLE(ID STRING, EXTENSION ARRAY(STRING), USE STRING, TEXT STRING, FAMILY STRING, GIVEN ARRAY(STRING), PREFIX ARRAY(STRING), SUFFIX ARRAY(STRING), PERIOD TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP)), TELECOM ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VALUE STRING, USE STRING, RANK INT32, PERIOD TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP))), ADDRESS TUPLE(ID STRING, EXTENSION ARRAY(STRING), USE STRING, TYPE STRING, TEXT STRING, LINE ARRAY(STRING), CITY STRING, DISTRICT STRING, STATE STRING, POSTALCODE STRING, COUNTRY STRING, PERIOD TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP)), GENDER STRING, ORGANIZATION TUPLE(ID STRING, EXTENSION ARRAY(STRING), REFERENCE STRING, TYPE STRING, IDENTIFIER TUPLE(ID STRING, EXTENSION ARRAY(STRING), USE STRING, TYPE TUPLE(ID STRING, EXTENSION ARRAY(STRING), CODING ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VERSION STRING, CODE STRING, DISPLAY STRING, USERSELECTED BOOLEAN)), TEXT STRING), SYSTEM STRING, VALUE STRING, PERIOD TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP), ASSIGNER STRING), DISPLAY STRING), PERIOD TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP))) 1 parquet/complex.parquet 22
+communication ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), MODIFIEREXTENSION ARRAY(STRING), LANGUAGE TUPLE(ID STRING, EXTENSION ARRAY(STRING), CODING ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VERSION STRING, CODE STRING, DISPLAY STRING, USERSELECTED BOOLEAN)), TEXT STRING), PREFERRED BOOLEAN)) 1 parquet/complex.parquet 23
+generalPractitioner ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), REFERENCE STRING, TYPE STRING, IDENTIFIER TUPLE(ID STRING, EXTENSION ARRAY(STRING), USE STRING, TYPE TUPLE(ID STRING, EXTENSION ARRAY(STRING), CODING ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VERSION STRING, CODE STRING, DISPLAY STRING, USERSELECTED BOOLEAN)), TEXT STRING), SYSTEM STRING, VALUE STRING, PERIOD TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP), ASSIGNER STRING), DISPLAY STRING)) 1 parquet/complex.parquet 24
+managingOrganization TUPLE(ID STRING, EXTENSION ARRAY(STRING), REFERENCE STRING, TYPE STRING, IDENTIFIER TUPLE(ID STRING, EXTENSION ARRAY(STRING), USE STRING, TYPE TUPLE(ID STRING, EXTENSION ARRAY(STRING), CODING ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VERSION STRING, CODE STRING, DISPLAY STRING, USERSELECTED BOOLEAN)), TEXT STRING), SYSTEM STRING, VALUE STRING, PERIOD TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP), ASSIGNER STRING), DISPLAY STRING) 1 parquet/complex.parquet 25
+link ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), MODIFIEREXTENSION ARRAY(STRING), OTHER TUPLE(ID STRING, EXTENSION ARRAY(STRING), REFERENCE STRING, TYPE STRING, IDENTIFIER TUPLE(ID STRING, EXTENSION ARRAY(STRING), USE STRING, TYPE TUPLE(ID STRING, EXTENSION ARRAY(STRING), CODING ARRAY(TUPLE(ID STRING, EXTENSION ARRAY(STRING), SYSTEM STRING, VERSION STRING, CODE STRING, DISPLAY STRING, USERSELECTED BOOLEAN)), TEXT STRING), SYSTEM STRING, VALUE STRING, PERIOD TUPLE(ID STRING, EXTENSION ARRAY(STRING), START TIMESTAMP, END TIMESTAMP), ASSIGNER STRING), DISPLAY STRING), TYPE STRING)) 1 parquet/complex.parquet 26
+yy__version INT 1 parquet/complex.parquet 27
+yy__us_core_race VARCHAR 1 parquet/complex.parquet 28
+yy__us_core_ethnicity VARCHAR 1 parquet/complex.parquet 29
+yy__us_core_birthsex TUPLE(VALUECODE STRING,) 1 parquet/complex.parquet 30
query
select * from infer_schema(location => '@data/parquet/variant.parquet')
----
-a INT 0 0
-b VARIANT 0 1
+a INT 0 parquet/variant.parquet 0
+b VARIANT 0 parquet/variant.parquet 1
query
select * from infer_schema(location => '@data/parquet/', FILE_FORMAT => 'PARQUET', pattern => 'tuple.*')
----
-id INT 0 0
-t TUPLE(A INT32, B STRING) 0 1
+id INT 0 parquet/tuple.parquet 0
+t TUPLE(A INT32, B STRING) 0 parquet/tuple.parquet 1
statement ok
drop CONNECTION IF EXISTS my_conn
@@ -61,168 +61,195 @@ drop CONNECTION IF EXISTS my_conn
statement ok
create CONNECTION my_conn STORAGE_TYPE = 's3' access_key_id='minioadmin' secret_access_key='minioadmin' endpoint_url='http://127.0.0.1:9900/' region='auto'
-query
-select * from INFER_SCHEMA(location => 's3://testbucket/data/parquet/tuple.parquet', connection_name => 'my_conn')
+# query
+# select * from INFER_SCHEMA(location => 's3://testbucket/data/parquet/tuple.parquet', connection_name => 'my_conn')
+# ----
+# id INT 0 parquet/tuple.parquet 0
+# t TUPLE(A INT32, B STRING) 0 parquet/tuple.parquet 1
+
+query T
+select CASE
+ WHEN filenames LIKE '%,%'
+ THEN 'Y'
+ ELSE 'N'
+ END AS format_check
+from infer_schema(location => '@data/parquet/max_file_count', max_file_count => 2)
----
-id INT 0 0
-t TUPLE(A INT32, B STRING) 0 1
+Y
+Y
# CSV
statement ok
create or replace file format head_csv_format type = 'CSV' field_delimiter = ',' skip_header = 1;
-query TTBI
+query TTBTI
select * from infer_schema(location => '@data/csv/numbers_with_headers.csv', file_format => 'CSV');
----
-column_1 VARCHAR 1 0
-column_2 VARCHAR 1 1
+column_1 VARCHAR 1 csv/numbers_with_headers.csv 0
+column_2 VARCHAR 1 csv/numbers_with_headers.csv 1
-query TTBI
+query TTBTI
select * from infer_schema(location => '@data/csv/numbers_with_headers.csv', file_format => 'head_csv_format');
----
-id BIGINT 1 0
-value BIGINT 1 1
+id BIGINT 1 csv/numbers_with_headers.csv 0
+value BIGINT 1 csv/numbers_with_headers.csv 1
statement error
select * from infer_schema(location => '@data/csv/ragged.csv', file_format => 'head_csv_format');
-query TTBI
+query TTBTI
select * from infer_schema(location => '@data/csv/max_records.csv', file_format => 'head_csv_format');
----
-id BIGINT 1 0
-value VARCHAR 1 1
+id BIGINT 1 csv/max_records.csv 0
+value VARCHAR 1 csv/max_records.csv 1
-query TTBI
+query TTBTI
select * from infer_schema(location => '@data/csv/max_records.csv', file_format => 'head_csv_format', max_records_pre_file => 5);
----
-id BIGINT 1 0
-value BIGINT 1 1
+id BIGINT 1 csv/max_records.csv 0
+value BIGINT 1 csv/max_records.csv 1
-query TTBI
+query TTBTI
select * from infer_schema(location => '@data/csv/max_records.zip', file_format => 'head_csv_format', max_records_pre_file => 5);
----
-id BIGINT 1 0
-value BIGINT 1 1
+id BIGINT 1 csv/max_records.zip 0
+value BIGINT 1 csv/max_records.zip 1
-query TTBI
+query TTBTI
select * from infer_schema(location => '@data/csv/max_records.zst', file_format => 'head_csv_format', max_records_pre_file => 5);
----
-id BIGINT 1 0
-value BIGINT 1 1
+id BIGINT 1 csv/max_records.zst 0
+value BIGINT 1 csv/max_records.zst 1
-query TTBI
+query TTBTI
select * from infer_schema(location => '@data/csv/max_records.csv', file_format => 'head_csv_format', max_records_pre_file => 5);
----
-id BIGINT 1 0
-value BIGINT 1 1
+id BIGINT 1 csv/max_records.csv 0
+value BIGINT 1 csv/max_records.csv 1
-query TTBI
+query TTBTI
select * from infer_schema(location => '@data/csv/max_records.xz', file_format => 'head_csv_format', max_records_pre_file => 5);
----
-id BIGINT 1 0
-value BIGINT 1 1
+id BIGINT 1 csv/max_records.xz 0
+value BIGINT 1 csv/max_records.xz 1
-query TTBI
+query TTBTI
select * from infer_schema(location => '@data/csv/types.csv', file_format => 'head_csv_format')
----
-bool_col BOOLEAN 1 0
-int_col BIGINT 1 1
-float_col DOUBLE 1 2
-date_col DATE 1 3
-ts_sec TIMESTAMP 1 4
-ts_ms TIMESTAMP 1 5
-ts_us TIMESTAMP 1 6
-ts_ns TIMESTAMP 1 7
-utf8_col VARCHAR 1 8
-
-query TTBI
+bool_col BOOLEAN 1 csv/types.csv 0
+int_col BIGINT 1 csv/types.csv 1
+float_col DOUBLE 1 csv/types.csv 2
+date_col DATE 1 csv/types.csv 3
+ts_sec TIMESTAMP 1 csv/types.csv 4
+ts_ms TIMESTAMP 1 csv/types.csv 5
+ts_us TIMESTAMP 1 csv/types.csv 6
+ts_ns TIMESTAMP 1 csv/types.csv 7
+utf8_col VARCHAR 1 csv/types.csv 8
+
+query TTBTI
select * from infer_schema(location => '@data/csv/merge/', file_format => 'head_csv_format');
----
-col1 VARCHAR 1 0
-col2 VARCHAR 1 1
-col3 VARCHAR 1 2
-col4 VARCHAR 1 3
-col5 VARCHAR 1 4
-
-query TTBI
-select * from infer_schema(location => '@data/csv/max_file_count/', file_format => 'head_csv_format', max_file_count => 2);
-----
-col1 BIGINT 1 0
-col2 BIGINT 1 1
-col3 BIGINT 1 2
-col4 BIGINT 1 3
-col5 BIGINT 1 4
+col1 VARCHAR 1 csv/merge/numbers_with_last_string.csv, csv/merge/numbers.csv 0
+col2 VARCHAR 1 csv/merge/numbers_with_last_string.csv, csv/merge/numbers.csv 1
+col3 VARCHAR 1 csv/merge/numbers_with_last_string.csv, csv/merge/numbers.csv 2
+col4 VARCHAR 1 csv/merge/numbers_with_last_string.csv, csv/merge/numbers.csv 3
+col5 VARCHAR 1 csv/merge/numbers_with_last_string.csv, csv/merge/numbers.csv 4
+
+query T
+select CASE
+ WHEN filenames LIKE '%,%'
+ THEN 'Y'
+ ELSE 'N'
+ END AS format_check
+from infer_schema(location => '@data/csv/max_file_count/', file_format => 'head_csv_format', max_file_count => 2);
+----
+Y
+Y
+Y
+Y
+Y
# NDJSON
-query TTBI
+query TTBTI
select * from infer_schema(location => '@data/ndjson/numbers.ndjson', file_format => 'NDJSON');
----
-id BIGINT 1 0
-value BIGINT 1 1
+id BIGINT 1 ndjson/numbers.ndjson 0
+value BIGINT 1 ndjson/numbers.ndjson 1
-query TTBI
+query TTBTI
select * from infer_schema(location => '@data/ndjson/ragged.ndjson', file_format => 'NDJSON');
----
-id BIGINT 1 0
-value BIGINT 1 1
-comment VARCHAR 1 2
+id BIGINT 1 ndjson/ragged.ndjson 0
+value BIGINT 1 ndjson/ragged.ndjson 1
+comment VARCHAR 1 ndjson/ragged.ndjson 2
-query TTBI
+query TTBTI
select * from infer_schema(location => '@data/ndjson/max_records.ndjson', file_format => 'NDJSON');
----
-id BIGINT 1 0
-value VARCHAR 1 1
+id BIGINT 1 ndjson/max_records.ndjson 0
+value VARCHAR 1 ndjson/max_records.ndjson 1
-query TTBI
+query TTBTI
select * from infer_schema(location => '@data/ndjson/max_records.ndjson', file_format => 'NDJSON', max_records_pre_file => 5);
----
-id BIGINT 1 0
-value BIGINT 1 1
+id BIGINT 1 ndjson/max_records.ndjson 0
+value BIGINT 1 ndjson/max_records.ndjson 1
-query TTBI
+query TTBTI
select * from infer_schema(location => '@data/ndjson/max_records.zip', file_format => 'NDJSON', max_records_pre_file => 5);
----
-id BIGINT 1 0
-value BIGINT 1 1
+id BIGINT 1 ndjson/max_records.zip 0
+value BIGINT 1 ndjson/max_records.zip 1
-query TTBI
+query TTBTI
select * from infer_schema(location => '@data/ndjson/max_records.zst', file_format => 'NDJSON', max_records_pre_file => 5);
----
-id BIGINT 1 0
-value BIGINT 1 1
+id BIGINT 1 ndjson/max_records.zst 0
+value BIGINT 1 ndjson/max_records.zst 1
-query TTBI
+query TTBTI
select * from infer_schema(location => '@data/ndjson/max_records.ndjson', file_format => 'NDJSON', max_records_pre_file => 5);
----
-id BIGINT 1 0
-value BIGINT 1 1
+id BIGINT 1 ndjson/max_records.ndjson 0
+value BIGINT 1 ndjson/max_records.ndjson 1
-query TTBI
+query TTBTI
select * from infer_schema(location => '@data/ndjson/max_records.xz', file_format => 'NDJSON', max_records_pre_file => 5)
----
-id BIGINT 1 0
-value BIGINT 1 1
+id BIGINT 1 ndjson/max_records.xz 0
+value BIGINT 1 ndjson/max_records.xz 1
-query TTBI
+query TTBTI
select * from infer_schema(location => '@data/ndjson/types.ndjson', file_format => 'NDJSON')
----
-bool_col BOOLEAN 1 0
-int_col BIGINT 1 1
-float_col DOUBLE 1 2
-date_col VARCHAR 1 3
-ts_sec VARCHAR 1 4
-ts_ms VARCHAR 1 5
-ts_us VARCHAR 1 6
-ts_ns VARCHAR 1 7
-utf8_col VARCHAR 1 8
-arr_col ARRAY(STRING) 1 9
-obj_col TUPLE(A INT64, B STRING) 1 10
-
-query TTBI
+bool_col BOOLEAN 1 ndjson/types.ndjson 0
+int_col BIGINT 1 ndjson/types.ndjson 1
+float_col DOUBLE 1 ndjson/types.ndjson 2
+date_col VARCHAR 1 ndjson/types.ndjson 3
+ts_sec VARCHAR 1 ndjson/types.ndjson 4
+ts_ms VARCHAR 1 ndjson/types.ndjson 5
+ts_us VARCHAR 1 ndjson/types.ndjson 6
+ts_ns VARCHAR 1 ndjson/types.ndjson 7
+utf8_col VARCHAR 1 ndjson/types.ndjson 8
+arr_col ARRAY(STRING) 1 ndjson/types.ndjson 9
+obj_col TUPLE(A INT64, B STRING) 1 ndjson/types.ndjson 10
+
+query TTBTI
select * from infer_schema(location => '@data/ndjson/merge/', file_format => 'NDJSON');
----
-col1 VARCHAR 1 0
-col2 VARCHAR 1 1
-col3 VARCHAR 1 2
-col4 VARCHAR 1 3
-col5 VARCHAR 1 4
+col1 VARCHAR 1 ndjson/merge/numbers_with_last_string.ndjson, ndjson/merge/numbers.ndjson 0
+col2 VARCHAR 1 ndjson/merge/numbers_with_last_string.ndjson, ndjson/merge/numbers.ndjson 1
+col3 VARCHAR 1 ndjson/merge/numbers_with_last_string.ndjson, ndjson/merge/numbers.ndjson 2
+col4 VARCHAR 1 ndjson/merge/numbers_with_last_string.ndjson, ndjson/merge/numbers.ndjson 3
+col5 VARCHAR 1 ndjson/merge/numbers_with_last_string.ndjson, ndjson/merge/numbers.ndjson 4
+
+query T
+select CASE
+ WHEN filenames LIKE '%,%'
+ THEN 'Y'
+ ELSE 'N'
+ END AS format_check
+from infer_schema(location => '@data/ndjson/max_file_count/', file_format => 'NDJSON', max_file_count => 2);
+----
+Y
+Y
\ No newline at end of file

From 4b6ef6d8e9cc9a0e77feeef7c46c0e07c6d87322 Mon Sep 17 00:00:00 2001
From: kould
Date: Tue, 9 Sep 2025 20:34:36 +0800
Subject: [PATCH 20/20] chore: codefmt

---
 .../stage/formats/parquet/infer_schema.test | 24 +++++++++----------
 .../options/parquet_missing_field.test | 14 +++++------
 .../formats/parquet/parquet_field_types.test | 14 +++++------
 3 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test
index cf67ad1ea49ca..9113d03729c9e 100644
--- a/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test
+++ b/tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test
@@ -147,13 +147,13 @@ ts_ns TIMESTAMP 1 csv/types.csv 7
utf8_col VARCHAR 1 csv/types.csv 8
query TTBTI
-select * from infer_schema(location => '@data/csv/merge/', file_format => 'head_csv_format');
+select column_name, type, nullable, order_id from infer_schema(location => '@data/csv/merge/', file_format => 'head_csv_format');
----
-col1 VARCHAR 1 csv/merge/numbers_with_last_string.csv, csv/merge/numbers.csv 0
-col2 VARCHAR 1 csv/merge/numbers_with_last_string.csv, csv/merge/numbers.csv 1
-col3 VARCHAR 1 csv/merge/numbers_with_last_string.csv, csv/merge/numbers.csv 2
-col4 VARCHAR 1 csv/merge/numbers_with_last_string.csv, csv/merge/numbers.csv 3
-col5 VARCHAR 1 csv/merge/numbers_with_last_string.csv, csv/merge/numbers.csv 4
+col1 VARCHAR 1 0
+col2 VARCHAR 1 1
+col3 VARCHAR 1 2
+col4 VARCHAR 1 3
+col5 VARCHAR 1 4
query T
select CASE
@@ -235,13 +235,13 @@ arr_col ARRAY(STRING) 1 ndjson/types.ndjson 9
obj_col TUPLE(A INT64, B STRING) 1 ndjson/types.ndjson 10
query TTBTI
-select * from infer_schema(location => '@data/ndjson/merge/', file_format => 'NDJSON');
+select column_name, type, nullable, order_id from infer_schema(location => '@data/ndjson/merge/', file_format => 'NDJSON');
----
-col1 VARCHAR 1 ndjson/merge/numbers_with_last_string.ndjson, ndjson/merge/numbers.ndjson 0
-col2 VARCHAR 1 ndjson/merge/numbers_with_last_string.ndjson, ndjson/merge/numbers.ndjson 1
-col3 VARCHAR 1 ndjson/merge/numbers_with_last_string.ndjson, ndjson/merge/numbers.ndjson 2
-col4 VARCHAR 1 ndjson/merge/numbers_with_last_string.ndjson, ndjson/merge/numbers.ndjson 3
-col5 VARCHAR 1 ndjson/merge/numbers_with_last_string.ndjson, ndjson/merge/numbers.ndjson 4
+col1 VARCHAR 1 0
+col2 VARCHAR 1 1
+col3 VARCHAR 1 2
+col4 VARCHAR 1 3
+col5 VARCHAR 1 4
query T
select CASE
diff --git a/tests/sqllogictests/suites/stage/formats/parquet/options/parquet_missing_field.test b/tests/sqllogictests/suites/stage/formats/parquet/options/parquet_missing_field.test
index 522f9167dc50b..9e5e2344678a3 100644
--- a/tests/sqllogictests/suites/stage/formats/parquet/options/parquet_missing_field.test
+++ b/tests/sqllogictests/suites/stage/formats/parquet/options/parquet_missing_field.test
@@ -7,17 +7,17 @@ create table t1 (c1 int, c2 int, c3 int64, c4 string default 'ok')
query
select * from infer_schema(location => '@data/parquet/diff_schema/f1.parquet')
----
-c1 BIGINT 1 0
-c2 SMALLINT 1 1
-c3 BIGINT 1 2
+c1 BIGINT 1 parquet/diff_schema/f1.parquet 0
+c2 SMALLINT 1 parquet/diff_schema/f1.parquet 1
+c3 BIGINT 1 parquet/diff_schema/f1.parquet 2
query
select * from infer_schema(location => '@data/parquet/diff_schema/f2.parquet')
----
-c6 BIGINT 1 0
-c5 BIGINT 1 1
-c2 BIGINT 1 2
-c4 VARCHAR 1 3
+c6 BIGINT 1 parquet/diff_schema/f2.parquet 0
+c5 BIGINT 1 parquet/diff_schema/f2.parquet 1
+c2 BIGINT 1 parquet/diff_schema/f2.parquet 2
+c4 VARCHAR 1 parquet/diff_schema/f2.parquet 3
query error
copy into t1 from @data/parquet/diff_schema/ file_format=(type=parquet) pattern='.*[.]parquet'
diff --git a/tests/sqllogictests/suites/stage/formats/parquet/parquet_field_types.test b/tests/sqllogictests/suites/stage/formats/parquet/parquet_field_types.test
index 1565b70444b99..94e2bc39feb0c 100644
--- a/tests/sqllogictests/suites/stage/formats/parquet/parquet_field_types.test
+++ b/tests/sqllogictests/suites/stage/formats/parquet/parquet_field_types.test
@@ -106,13 +106,13 @@ NULL
query
select * from infer_schema (location => '@data/parquet/int96.parquet')
----
-id VARCHAR 1 0
-t_bool BOOLEAN 1 1
-t_float FLOAT 1 2
-t_double DOUBLE 1 3
-t_timestamp TIMESTAMP 1 4
-t_data DATE 1 5
-t_array ARRAY(INT32) 1 6
+id VARCHAR 1 parquet/int96.parquet 0
+t_bool BOOLEAN 1 parquet/int96.parquet 1
+t_float FLOAT 1 parquet/int96.parquet 2
+t_double DOUBLE 1 parquet/int96.parquet 3
+t_timestamp TIMESTAMP 1 parquet/int96.parquet 4
+t_data DATE 1 parquet/int96.parquet 5
+t_array ARRAY(INT32) 1 parquet/int96.parquet 6
# the physical type of column t_timestamp is INT96
query