1- use std:: path:: { Path , PathBuf } ;
21use datafusion:: arrow:: error:: ArrowError ;
32use datafusion:: dataframe:: DataFrameWriteOptions ;
4- use datafusion:: prelude:: * ;
53use datafusion:: error:: DataFusionError ;
6- use thiserror:: Error ;
7- use tempfile:: NamedTempFile ;
4+ use datafusion:: prelude:: * ;
85use reqwest:: Client ;
6+ use std:: path:: { Path , PathBuf } ;
7+ use tempfile:: NamedTempFile ;
8+ use thiserror:: Error ;
99
1010#[ derive( Debug , PartialEq , Eq ) ]
1111pub enum FileFormat {
@@ -47,21 +47,28 @@ pub enum DfKitError {
4747 Reqwest ( #[ from] reqwest:: Error ) ,
4848}
4949
50- pub fn file_type (
51- file_path : & Path ,
52- ) -> Result < FileFormat , FileParseError > {
53- match Path :: new ( file_path) . extension ( ) . and_then ( |ext| ext. to_str ( ) ) {
54- Some ( "csv" ) => Ok ( FileFormat :: Csv ) ,
55- Some ( "parquet" ) => Ok ( FileFormat :: Parquet ) ,
56- Some ( "json" ) => Ok ( FileFormat :: Json ) ,
57- Some ( "avro" ) => Ok ( FileFormat :: Avro ) ,
58- Some ( _) => Err ( FileParseError :: UnsupportedFileFormat ) ,
59- None => Err ( FileParseError :: InvalidExtension ) ,
60- }
50+ pub fn file_type ( file_path : & Path ) -> Result < FileFormat , FileParseError > {
51+ match Path :: new ( file_path)
52+ . extension ( )
53+ . and_then ( |ext| ext. to_str ( ) )
54+ {
55+ Some ( "csv" ) => Ok ( FileFormat :: Csv ) ,
56+ Some ( "parquet" ) => Ok ( FileFormat :: Parquet ) ,
57+ Some ( "json" ) => Ok ( FileFormat :: Json ) ,
58+ Some ( "avro" ) => Ok ( FileFormat :: Avro ) ,
59+ Some ( _) => Err ( FileParseError :: UnsupportedFileFormat ) ,
60+ None => Err ( FileParseError :: InvalidExtension ) ,
61+ }
6162}
6263
63- pub async fn register_table ( ctx : & SessionContext , table_name : & str , file_path : & Path ) -> Result < DataFrame , DfKitError > {
64- let path_str = file_path. to_str ( ) . ok_or ( DfKitError :: FileParse ( FileParseError :: InvalidExtension ) ) ?;
64+ pub async fn register_table (
65+ ctx : & SessionContext ,
66+ table_name : & str ,
67+ file_path : & Path ,
68+ ) -> Result < DataFrame , DfKitError > {
69+ let path_str = file_path
70+ . to_str ( )
71+ . ok_or ( DfKitError :: FileParse ( FileParseError :: InvalidExtension ) ) ?;
6572 let is_url = path_str. starts_with ( "http://" ) || path_str. starts_with ( "https://" ) ;
6673
6774 let actual_path = if is_url {
@@ -72,20 +79,38 @@ pub async fn register_table(ctx: &SessionContext, table_name: &str, file_path: &
7279 } ;
7380
7481 let file_format = file_type ( & actual_path) ?;
75- let file_name = actual_path. to_str ( ) . ok_or ( DfKitError :: FileParse ( FileParseError :: InvalidExtension ) ) ?;
82+ let file_name = actual_path
83+ . to_str ( )
84+ . ok_or ( DfKitError :: FileParse ( FileParseError :: InvalidExtension ) ) ?;
7685 match file_format {
77- FileFormat :: Csv => ctx. register_csv ( table_name, file_name, CsvReadOptions :: default ( ) ) . await ?,
78- FileFormat :: Parquet => ctx. register_parquet ( table_name, file_name, ParquetReadOptions :: default ( ) ) . await ?,
79- FileFormat :: Json => ctx. register_json ( table_name, file_name, NdJsonReadOptions :: default ( ) ) . await ?,
80- FileFormat :: Avro => ctx. register_avro ( table_name, file_name, AvroReadOptions :: default ( ) ) . await ?,
86+ FileFormat :: Csv => {
87+ ctx. register_csv ( table_name, file_name, CsvReadOptions :: default ( ) )
88+ . await ?
89+ }
90+ FileFormat :: Parquet => {
91+ ctx. register_parquet ( table_name, file_name, ParquetReadOptions :: default ( ) )
92+ . await ?
93+ }
94+ FileFormat :: Json => {
95+ ctx. register_json ( table_name, file_name, NdJsonReadOptions :: default ( ) )
96+ . await ?
97+ }
98+ FileFormat :: Avro => {
99+ ctx. register_avro ( table_name, file_name, AvroReadOptions :: default ( ) )
100+ . await ?
101+ }
81102 } ;
82103
83104 Ok ( ctx. table ( table_name) . await ?)
84105}
85106
86- pub fn parse_file_list ( files : Option < String > , dir : Option < PathBuf > ) -> Result < Vec < PathBuf > , DfKitError > {
107+ pub fn parse_file_list (
108+ files : Option < String > ,
109+ dir : Option < PathBuf > ,
110+ ) -> Result < Vec < PathBuf > , DfKitError > {
87111 if let Some ( file_str) = files {
88- Ok ( file_str. split ( ',' )
112+ Ok ( file_str
113+ . split ( ',' )
89114 . map ( |s| PathBuf :: from ( s. trim ( ) ) )
90115 . collect ( ) )
91116 } else if let Some ( dir_path) = dir {
@@ -102,17 +127,46 @@ pub fn parse_file_list(files: Option<String>, dir: Option<PathBuf>) -> Result<Ve
102127 }
103128 Ok ( file_list)
104129 } else {
105- Err ( DfKitError :: CustomError ( "No files or directory provided" . into ( ) ) )
130+ Err ( DfKitError :: CustomError (
131+ "No files or directory provided" . into ( ) ,
132+ ) )
106133 }
107134}
108135
109- pub async fn write_output ( df : DataFrame , out_path : & Path , format : & FileFormat ) -> Result < ( ) , DfKitError > {
136+ pub async fn write_output (
137+ df : DataFrame ,
138+ out_path : & Path ,
139+ format : & FileFormat ,
140+ ) -> Result < ( ) , DfKitError > {
110141 match format {
111- FileFormat :: Csv => df. write_csv ( out_path. to_str ( ) . unwrap ( ) , DataFrameWriteOptions :: default ( ) , None ) . await ?,
112- FileFormat :: Parquet => df. write_parquet ( out_path. to_str ( ) . unwrap ( ) , DataFrameWriteOptions :: default ( ) , None ) . await ?,
113- FileFormat :: Json => df. write_json ( out_path. to_str ( ) . unwrap ( ) , DataFrameWriteOptions :: default ( ) , None ) . await ?,
142+ FileFormat :: Csv => {
143+ df. write_csv (
144+ out_path. to_str ( ) . unwrap ( ) ,
145+ DataFrameWriteOptions :: default ( ) ,
146+ None ,
147+ )
148+ . await ?
149+ }
150+ FileFormat :: Parquet => {
151+ df. write_parquet (
152+ out_path. to_str ( ) . unwrap ( ) ,
153+ DataFrameWriteOptions :: default ( ) ,
154+ None ,
155+ )
156+ . await ?
157+ }
158+ FileFormat :: Json => {
159+ df. write_json (
160+ out_path. to_str ( ) . unwrap ( ) ,
161+ DataFrameWriteOptions :: default ( ) ,
162+ None ,
163+ )
164+ . await ?
165+ }
114166 FileFormat :: Avro => {
115- return Err ( DfKitError :: DataFusion ( DataFusionError :: NotImplemented ( "Avro write not supported" . into ( ) ) ) ) ;
167+ return Err ( DfKitError :: DataFusion ( DataFusionError :: NotImplemented (
168+ "Avro write not supported" . into ( ) ,
169+ ) ) ) ;
116170 }
117171 } ;
118172 Ok ( ( ) )
@@ -122,13 +176,17 @@ pub async fn download_to_tempfile(url: &str) -> Result<(NamedTempFile, PathBuf),
122176 let response = Client :: new ( ) . get ( url) . send ( ) . await ?. bytes ( ) . await ?;
123177
124178 // Try to extract the file extension from the URL
125- let ext = url. split ( '.' ) . last ( ) . and_then ( |e| {
126- let e = e. split ( '?' ) . next ( ) . unwrap_or ( e) ; // strip query string
127- match e {
128- "csv" | "json" | "parquet" | "avro" => Some ( e) ,
129- _ => None ,
130- }
131- } ) . ok_or ( FileParseError :: InvalidExtension ) ?;
179+ let ext = url
180+ . split ( '.' )
181+ . last ( )
182+ . and_then ( |e| {
183+ let e = e. split ( '?' ) . next ( ) . unwrap_or ( e) ; // strip query string
184+ match e {
185+ "csv" | "json" | "parquet" | "avro" => Some ( e) ,
186+ _ => None ,
187+ }
188+ } )
189+ . ok_or ( FileParseError :: InvalidExtension ) ?;
132190
133191 // Create temp file with extension
134192 let tempfile = NamedTempFile :: new ( ) ?;
@@ -140,4 +198,3 @@ pub async fn download_to_tempfile(url: &str) -> Result<(NamedTempFile, PathBuf),
140198
141199 Ok ( ( tempfile, path_with_ext) )
142200}
143-
0 commit comments