 // limitations under the License.
 
 use std::collections::BTreeMap;
+use std::io::Cursor;
 use std::sync::Arc;
 
+use arrow_csv::reader::Format;
+use arrow_json::reader::infer_json_schema;
+use arrow_schema::Schema as ArrowSchema;
 use databend_common_ast::ast::FileLocation;
 use databend_common_ast::ast::UriLocation;
 use databend_common_catalog::table_context::TableContext;
@@ -26,7 +30,8 @@ use databend_common_expression::types::UInt64Type;
 use databend_common_expression::DataBlock;
 use databend_common_expression::FromData;
 use databend_common_expression::TableSchema;
-use databend_common_meta_app::principal::StageFileFormatType;
+use databend_common_meta_app::principal::CsvFileFormatParams;
+use databend_common_meta_app::principal::FileFormatParams;
 use databend_common_meta_app::principal::StageType;
 use databend_common_pipeline_core::processors::OutputPort;
 use databend_common_pipeline_core::processors::ProcessorPtr;
@@ -37,24 +42,25 @@ use databend_common_storage::init_stage_operator;
 use databend_common_storage::read_parquet_schema_async_rs;
 use databend_common_storage::StageFilesInfo;
 use databend_common_users::Object;
+use opendal::Operator;
 use opendal::Scheme;
 
 use crate::table_functions::infer_schema::infer_schema_table::INFER_SCHEMA;
 use crate::table_functions::infer_schema::table_args::InferSchemaArgsParsed;
 
-pub(crate) struct ParquetInferSchemaSource {
+pub(crate) struct InferSchemaSource {
     is_finished: bool,
     ctx: Arc<dyn TableContext>,
     args_parsed: InferSchemaArgsParsed,
 }
 
-impl ParquetInferSchemaSource {
+impl InferSchemaSource {
     pub fn create(
         ctx: Arc<dyn TableContext>,
         output: Arc<OutputPort>,
         args_parsed: InferSchemaArgsParsed,
     ) -> Result<ProcessorPtr> {
-        AsyncSourcer::create(ctx.clone(), output, ParquetInferSchemaSource {
+        AsyncSourcer::create(ctx.clone(), output, InferSchemaSource {
             is_finished: false,
             ctx,
             args_parsed,
@@ -63,7 +69,7 @@ impl ParquetInferSchemaSource {
 }
 
 #[async_trait::async_trait]
-impl AsyncSource for ParquetInferSchemaSource {
+impl AsyncSource for InferSchemaSource {
     const NAME: &'static str = INFER_SCHEMA;
 
     #[async_backtrace::framed]
@@ -127,9 +133,9 @@ impl AsyncSource for ParquetInferSchemaSource {
             Some(f) => self.ctx.get_file_format(f).await?,
             None => stage_info.file_format_params.clone(),
         };
-        let schema = match (first_file.as_ref(), file_format_params.get_type()) {
+        let schema = match (first_file.as_ref(), file_format_params) {
             (None, _) => return Ok(None),
-            (Some(first_file), StageFileFormatType::Parquet) => {
+            (Some(first_file), FileFormatParams::Parquet(_)) => {
                 let arrow_schema = read_parquet_schema_async_rs(
                     &operator,
                     &first_file.path,
@@ -138,6 +144,27 @@ impl AsyncSource for ParquetInferSchemaSource {
                 .await?;
                 TableSchema::try_from(&arrow_schema)?
             }
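+            // CSV: download the file and let arrow-csv infer column types from
+            // at most `max_records` sampled records.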
+            (Some(first_file), FileFormatParams::Csv(params)) => {
+                let arrow_schema = read_csv_metadata_async(
+                    &first_file.path,
+                    &operator,
+                    Some(first_file.size),
+                    self.args_parsed.max_records,
+                    &params,
+                )
+                .await?;
+                TableSchema::try_from(&arrow_schema)?
+            }
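+            // NDJSON: same approach, with arrow-json inferring a type for each
+            // key seen in the sampled records.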
+            (Some(first_file), FileFormatParams::NdJson(_)) => {
+                let arrow_schema = read_json_metadata_async(
+                    &first_file.path,
+                    &operator,
+                    Some(first_file.size),
+                    self.args_parsed.max_records,
+                )
+                .await?;
+                TableSchema::try_from(&arrow_schema)?
+            }
             _ => {
-                return Err(ErrorCode::BadArguments(
-                    "infer_schema is currently limited to format Parquet",
+                return Err(ErrorCode::BadArguments(
+                    "infer_schema is currently limited to Parquet, CSV and NDJSON",
@@ -168,3 +195,52 @@ impl AsyncSource for ParquetInferSchemaSource {
         Ok(Some(block))
     }
 }
+
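+/// Infer an Arrow schema from a staged CSV file: the object is read fully into
+/// memory and at most `max_records` records are sampled for type inference.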
+pub async fn read_csv_metadata_async(
+    path: &str,
+    operator: &Operator,
+    file_size: Option<u64>,
+    max_records: Option<usize>,
+    params: &CsvFileFormatParams,
+) -> Result<ArrowSchema> {
+    let file_size = match file_size {
+        None => operator.stat(path).await?.content_length(),
+        Some(n) => n,
+    };
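+    // An empty escape string in the format params means no escape character.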
+    let escape = if params.escape.is_empty() {
+        None
+    } else {
+        Some(params.escape.as_bytes()[0])
+    };
+
+    // TODO: it would be better to stream this via a `Read` implementation
+    // instead of buffering the whole file.
+    let buf = operator.read_with(path).range(..file_size).await?.to_vec();
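+    // Build the arrow-csv sniffer from the stage's CSV dialect: field
+    // delimiter, quote, and whether the first row is a header line.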
+    let mut format = Format::default()
+        .with_delimiter(params.field_delimiter.as_bytes()[0])
+        .with_quote(params.quote.as_bytes()[0])
+        .with_header(params.headers != 0);
+
+    if let Some(escape) = escape {
+        format = format.with_escape(escape);
+    }
+    let (schema, _) = format.infer_schema(Cursor::new(&buf), max_records)?;
+
+    Ok(schema)
+}
+
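+/// Infer an Arrow schema from a staged NDJSON file by sampling at most
+/// `max_records` newline-delimited JSON records.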
+pub async fn read_json_metadata_async(
+    path: &str,
+    operator: &Operator,
+    file_size: Option<u64>,
+    max_records: Option<usize>,
+) -> Result<ArrowSchema> {
+    let file_size = match file_size {
+        None => operator.stat(path).await?.content_length(),
+        Some(n) => n,
+    };
+    // TODO: it would be better to stream this via a `Read` implementation
+    // instead of buffering the whole file.
+    let buf = operator.read_with(path).range(..file_size).await?.to_vec();
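+    // infer_json_schema unifies the types observed across the sampled records.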
+    let (schema, _) = infer_json_schema(Cursor::new(&buf), max_records)?;
+
+    Ok(schema)
+}