from FileUtils import FileUtils, OutputFileType
# Initialize with default configuration
file_utils = FileUtils()
# Load data from different formats
df_csv = file_utils.load_single_file("data.csv", input_type="raw")
df_excel = file_utils.load_single_file("data.xlsx", input_type="raw")
df_parquet = file_utils.load_single_file("data.parquet", input_type="raw")
df_json = file_utils.load_single_file("data.json", input_type="raw")
df_yaml = file_utils.load_single_file("data.yaml", input_type="raw")
# Save data
file_utils.save_data_to_storage(
data=df, # Single DataFrame
file_name="output",
output_type="processed",
output_filetype=OutputFileType.CSV
)
# Save multiple DataFrames
file_utils.save_data_to_storage(
data={"Sheet1": df1, "Sheet2": df2}, # Dictionary of DataFrames
file_name="multi_output",
output_type="processed",
output_filetype=OutputFileType.XLSX
)
# Save to a specific subdirectory dynamically
file_utils.save_data_to_storage(
data=df,
file_name="report_summary",
output_type="processed",
output_filetype=OutputFileType.CSV,
sub_path="run_1/results" # Creates data/processed/run_1/results/
)
# Load data from the specific subdirectory
loaded_df = file_utils.load_single_file(
file_path="report_summary.csv", # Just the filename
input_type="processed",
sub_path="run_1/results" # Specify the sub_path
)
# Enhanced DOCX with Template Support
markdown_content = """# Project Report
## Executive Summary
This is a comprehensive analysis of our project progress.
## Key Findings
- **Important**: We've achieved 95% completion
- [ ] Complete final testing
- [x] Update documentation
| Metric | Value | Status |
|--------|-------|--------|
| Progress | 95% | ✅ On Track |
| Budget | $45,000 | ✅ Under Budget |
"""
# Convert markdown to DOCX with template
saved_path, _ = file_utils.save_document_to_storage(
content=markdown_content,
output_filetype=OutputFileType.DOCX,
output_type="processed",
file_name="project_report",
template="review", # Use specific template
add_provenance=True,
add_reviewer_instructions=True
)
# Load multiple files from a subdirectory
# Assume data_a.csv and data_b.csv exist in data/raw/source_x/files/
multi_loaded = file_utils.load_multiple_files(
file_paths=["data_a.csv", "data_b.csv"], # Filenames only
input_type="raw",
sub_path="source_x/files"
)
The file_exists() method provides a safe way to check if files exist without raising exceptions:
# Check if a config file exists
if file_utils.file_exists("ACME.config-defaults.yml", input_type="config", sub_path="ACME"):
config = file_utils.load_yaml("ACME.config-defaults.yml", input_type="config", sub_path="ACME")
# Check template file at root level
if file_utils.file_exists("template.pptx", input_type="templates", sub_path="ADM", root_level=True):
template = file_utils.load_document_from_storage("template.pptx", input_type="templates", sub_path="ADM", root_level=True)
# Check absolute path
if file_utils.file_exists("/absolute/path/to/file.yml"):
data = file_utils.load_yaml("/absolute/path/to/file.yml")
Note: file_exists() never raises exceptions - it returns False on any error (file not found, permission errors, etc.).
The list_directory() method allows you to scan directories and filter results:
# List all config files for a customer
config_files = file_utils.list_directory(
input_type="config",
sub_path="ACME",
pattern="*.yml"
)
# Returns: ["ACME.config-defaults.yml", "ACME.settings.yml", ...]
# List templates in customer directory (root level)
templates = file_utils.list_directory(
input_type="templates",
sub_path="ADM",
root_level=True,
pattern="*.pptx"
)
# Returns: ["ADP-template_ADM.pptx", "report-template_ADM.pptx", ...]
# List only files (exclude directories)
files = file_utils.list_directory(
input_type="raw",
files_only=True
)
# List only directories
dirs = file_utils.list_directory(
input_type="raw",
directories_only=True
)
# List using absolute path
items = file_utils.list_directory("/absolute/path/to/dir", pattern="*.csv")
Note: list_directory() never raises exceptions - it returns an empty list on any error.
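Because an empty list is returned both when nothing matches and when an error occurs (for example, an unreadable directory), a plain truthiness check is usually sufficient; a minimal sketch:
# list_directory() returns [] on any error, so a simple truthiness check
# covers both "no matches" and unreadable directories
csv_files = file_utils.list_directory(input_type="raw", pattern="*.csv")
if not csv_files:
    print("No CSV files found (or the directory could not be read)")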
The create_directory() method now supports more flexible directory creation:
# Create directory with new signature
dir_path = file_utils.create_directory(
"charts",
input_type="processed",
sub_path="presentations/ACME/run123"
)
# Create directory at root level
dir_path = file_utils.create_directory(
"output",
input_type="reports",
root_level=True
)
# Create nested directories (parent directories created automatically)
dir_path = file_utils.create_directory(
"level3",
input_type="processed",
sub_path="level1/level2"
)
# Legacy usage still works
dir_path = file_utils.create_directory("features", parent_dir="data")from FileUtils.core.enums import OutputArea
chart_path = file_utils.save_bytes(
content=png_bytes,
file_stem="chart_q1",
sub_path="runs/acme/images",
output_type=OutputArea.PROCESSED, # or "processed"
file_ext="png",
)
# Request structured results (SaveResult objects) when saving multiple DataFrames
from FileUtils import SaveResult
res_map, _ = file_utils.save_data_to_storage(
data={"Sheet1": df1, "Sheet2": df2},
output_filetype=OutputFileType.XLSX,
file_name="multi_sheet",
structured_result=True,
)
assert isinstance(next(iter(res_map.values())), SaveResult)
# Enum values can be used instead of plain strings for input_type / output_type
from FileUtils.core.enums import InputType, OutputArea
file_utils.load_document_from_storage("readme.md", input_type=InputType.RAW)
file_utils.save_document_to_storage("# notes", OutputFileType.MARKDOWN, output_type=OutputArea.PROCESSED)# Save CSV with custom delimiter
file_utils.save_data_to_storage(
data=df,
file_name="output",
output_filetype=OutputFileType.CSV,
encoding="utf-8",
sep="|" # Custom delimiter
)
# Load CSV (delimiter is auto-detected)
df = file_utils.load_single_file("data.csv")# Save multiple sheets to Excel
data_dict = {
"Sheet1": df1,
"Sheet2": df2
}
file_utils.save_data_to_storage(
data=data_dict,
file_name="multi_sheet",
output_filetype=OutputFileType.XLSX
)
# Load all sheets from Excel
sheets_dict = file_utils.load_excel_sheets("multi_sheet.xlsx")
Convert Excel workbooks with multiple worksheets to CSV files while preserving workbook structure and metadata.
# Convert Excel workbook to CSV files with structure preservation
csv_files, structure_file = file_utils.convert_excel_to_csv_with_structure(
excel_file_path="workbook.xlsx",
file_name="converted_workbook",
preserve_structure=True
)
# Result:
# csv_files = {
# "Sheet1": "data/processed/converted_workbook_Sheet1.csv",
# "Sheet2": "data/processed/converted_workbook_Sheet2.csv"
# }
# structure_file = "data/processed/converted_workbook_structure.json"
# Load converted data
employees_df = file_utils.load_single_file(
"converted_workbook_Employees.csv",
input_type="processed"
)
Structure JSON includes (see the loading sketch after this list):
- Workbook metadata (source file, conversion timestamp, sheet count)
- Sheet details (dimensions, columns, data types, null counts)
- Data quality metrics (memory usage, index information)
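As a quick way to see exactly what was recorded for a given workbook, the structure file can be loaded like any other JSON document; the snippet below is a sketch that reuses the load_json() helper shown in the JSON section further down and does not assume specific key names:
# Sketch: load the structure JSON created by the conversion above and inspect it.
# Key names inside the file are not spelled out here; print them to see the schema.
structure = file_utils.load_json(
    file_path="converted_workbook_structure.json",
    input_type="processed"
)
print(list(structure.keys()))  # workbook metadata, per-sheet details, quality metrics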
Reconstruct Excel workbooks from modified CSV files using the structure JSON created during conversion.
# Reconstruct Excel workbook from modified CSV files
excel_path = file_utils.convert_csv_to_excel_workbook(
structure_json_path=structure_file,
file_name="reconstructed_workbook"
)
# The method creates:
# - Excel workbook with all sheets
# - Reconstruction metadata JSON
# - Handles missing files gracefully
Reconstruction Features:
- Uses structure JSON to locate and load CSV files
- Handles missing or modified CSV files gracefully
- Creates reconstruction metadata for audit trail
- Maintains original sheet names and structure
Customize directory names to match your project domain and workflow.
# config.yaml
directories:
data_directory: "documents" # Main directory name
subdirectories:
raw: "product_docs" # Input directory
processed: "cs_documents" # Output directory
templates: "templates" # Template directory# Initialize with custom configuration
file_utils = FileUtils(config_file="config.yaml")
# All operations automatically use custom directories
file_utils.save_data_to_storage(data, output_filetype=OutputFileType.CSV,
output_type="raw") # → documents/product_docs/
file_utils.load_single_file("data.csv", input_type="raw") # → documents/product_docs/data.csv
# Excel ↔ CSV conversion works seamlessly
csv_files, structure_file = file_utils.convert_excel_to_csv_with_structure(
"workbook.xlsx", input_type="raw", output_type="processed"
)
# → documents/product_docs/workbook.xlsx → documents/cs_documents/*.csv
Document Processing:
directories:
data_directory: "documents"
subdirectories:
raw: "source_docs"
processed: "ai_processed"
templates: "templates"Content Creation:
directories:
data_directory: "assets"
subdirectories:
raw: "source_materials"
processed: "final_content"
templates: "brand_templates"Research Projects:
directories:
data_directory: "experiments"
subdirectories:
raw: "data_collection"
processed: "analysis_results"
templates: "report_templates"Existing projects continue to work unchanged. The default configuration uses the traditional data/ directory structure:
# Default behavior (unchanged)
file_utils = FileUtils()
# Uses: project_root/data/raw/ and project_root/data/processed/
JSON files can be used in two ways: as DataFrame storage or as structured documents.
# Save JSON in different formats
file_utils.save_data_to_storage(
data=df,
file_name="records",
output_filetype=OutputFileType.JSON,
orient="records" # List of records format
)
file_utils.save_data_to_storage(
data=df,
file_name="index",
output_filetype=OutputFileType.JSON,
orient="index" # Dictionary format with index as keys
)
# Load JSON (format is auto-detected)
df = file_utils.load_single_file("data.json")# Save structured configuration as JSON document
config_data = {
"database": {
"host": "localhost",
"port": 5432,
"name": "analytics"
},
"api": {
"timeout": 30,
"retries": 3,
"base_url": "https://api.example.com"
},
"features": {
"enable_caching": True,
"cache_ttl": 3600,
"max_connections": 100
}
}
saved_path, _ = file_utils.save_document_to_storage(
content=config_data,
output_filetype=OutputFileType.JSON,
output_type="processed",
file_name="app_config"
)
# Load configuration
loaded_config = file_utils.load_json(
file_path="app_config.json",
input_type="processed"
)
print(f"Database host: {loaded_config['database']['host']}")import pandas as pd
import numpy as np
# Create data with pandas types
df = pd.DataFrame({
'date': pd.date_range('2024-01-01', periods=5),
'value': np.random.randn(5),
'category': ['A', 'B', 'C', 'D', 'E']
})
# This works without manual conversion!
json_data = {
'metadata': {
'created': pd.Timestamp.now(),
'version': '1.0',
'total_records': len(df)
},
'data': df.to_dict('records') # Pandas Timestamps automatically converted
}
saved_path, _ = file_utils.save_document_to_storage(
content=json_data,
output_filetype=OutputFileType.JSON,
output_type="processed",
file_name="data_with_types"
)
# Load the data
loaded_data = file_utils.load_json(
file_path="data_with_types.json",
input_type="processed"
)
print(f"Created: {loaded_data['metadata']['created']}") # ISO format stringYAML files can be used in two ways: as DataFrame storage or as structured documents.
# Save YAML with custom options
file_utils.save_data_to_storage(
data=df,
file_name="output",
output_filetype=OutputFileType.YAML,
yaml_options={
"default_flow_style": False,
"sort_keys": True,
"indent": 4
},
orient="records" # or "index"
)
# Load YAML as DataFrame
df = file_utils.load_single_file("data.yaml")# Save structured configuration as YAML document
pipeline_config = {
"project": {
"name": "Data Analysis Pipeline",
"version": "2.1.0",
"description": "Automated data processing and analysis"
},
"data_sources": {
"primary": {
"type": "database",
"connection": "postgresql://localhost:5432/analytics",
"tables": ["users", "transactions", "products"]
},
"secondary": {
"type": "api",
"url": "https://api.external-service.com",
"auth": {"type": "bearer", "token": "your-token"}
}
},
"processing": {
"batch_size": 1000,
"parallel_workers": 4,
"retry_attempts": 3,
"timeout": 300
},
"output": {
"formats": ["csv", "parquet"],
"compression": "gzip",
"include_metadata": True
}
}
saved_path, _ = file_utils.save_document_to_storage(
content=pipeline_config,
output_filetype=OutputFileType.YAML,
output_type="processed",
file_name="pipeline_config"
)
# Load configuration
loaded_config = file_utils.load_yaml(
file_path="pipeline_config.yaml",
input_type="processed"
)
print(f"Project: {loaded_config['project']['name']}")
print(f"Batch size: {loaded_config['processing']['batch_size']}")import pandas as pd
from datetime import datetime
# Create configuration with pandas types
config_with_types = {
"metadata": {
"created": pd.Timestamp.now(),
"last_updated": datetime.now(),
"version": "1.0"
},
"data_ranges": {
"start_date": pd.Timestamp('2024-01-01'),
"end_date": pd.Timestamp('2024-12-31'),
"frequency": "daily"
},
"settings": {
"debug": True,
"log_level": "INFO"
}
}
saved_path, _ = file_utils.save_document_to_storage(
content=config_with_types,
output_filetype=OutputFileType.YAML,
output_type="processed",
file_name="config_with_dates"
)
# Load the configuration
loaded_config = file_utils.load_yaml(
file_path="config_with_dates.yaml",
input_type="processed"
)
print(f"Created: {loaded_config['metadata']['created']}") # Properly formatted# Save Parquet with compression
file_utils.save_data_to_storage(
data=df,
file_name="output",
output_filetype=OutputFileType.PARQUET,
compression="snappy" # or "gzip", "brotli", etc.
)
# Load Parquet
df = file_utils.load_single_file("data.parquet")FileUtils now supports rich document formats perfect for AI/agentic workflows:
# Save simple markdown
markdown_content = """# Analysis Report
## Key Findings
- Model accuracy: 95.2%
- Processing time: 2.3 seconds
## Recommendations
1. Implement additional training data
2. Optimize inference pipeline
"""
saved_path, _ = file_utils.save_document_to_storage(
content=markdown_content,
output_filetype=OutputFileType.MARKDOWN,
output_type="processed",
file_name="analysis_report"
)
# Save markdown with YAML frontmatter
structured_content = {
"frontmatter": {
"title": "AI Analysis Report",
"author": "AI Agent",
"confidence": 0.95,
"timestamp": "2024-01-15T10:30:00Z"
},
"body": """# Analysis Results
## Summary
The analysis identified 3 key insights.
## Key Findings
- Pattern detected with 94.2% confidence
- 3 anomalies identified
- Recommended actions: Update model, retrain
"""
}
saved_path, _ = file_utils.save_document_to_storage(
content=structured_content,
output_filetype=OutputFileType.MARKDOWN,
output_type="processed",
file_name="structured_report",
sub_path="reports/2024"
)
# Load markdown
loaded_content = file_utils.load_document_from_storage(
file_path="analysis_report.md",
input_type="processed"
)
# Save simple DOCX document
docx_content = "This is a test document for DOCX format."
saved_path, _ = file_utils.save_document_to_storage(
content=docx_content,
output_filetype=OutputFileType.DOCX,
output_type="processed",
file_name="simple_document"
)
# Save structured DOCX with headings and tables
structured_docx = {
"title": "Project Report",
"sections": [
{
"heading": "Executive Summary",
"level": 1,
"text": "Project completed successfully."
},
{
"heading": "Results",
"level": 2,
"table": [
["Metric", "Value", "Unit"],
["Accuracy", "95.2", "%"],
["Speed", "2.3", "seconds"],
["Memory", "512", "MB"]
]
}
]
}
saved_path, _ = file_utils.save_document_to_storage(
content=structured_docx,
output_filetype=OutputFileType.DOCX,
output_type="processed",
file_name="project_report"
)
# Load DOCX (extracts text content)
loaded_content = file_utils.load_document_from_storage(
file_path="simple_document.docx",
input_type="processed"
)
# Save simple PDF
pdf_content = "This is a test document for PDF format."
saved_path, _ = file_utils.save_document_to_storage(
content=pdf_content,
output_filetype=OutputFileType.PDF,
output_type="processed",
file_name="simple_pdf"
)
# Save structured PDF
structured_pdf = {
"title": "Technical Documentation",
"sections": [
{
"heading": "Introduction",
"text": "This document provides technical specifications."
},
{
"heading": "Architecture",
"text": "The system follows a microservices architecture."
}
]
}
saved_path, _ = file_utils.save_document_to_storage(
content=structured_pdf,
output_filetype=OutputFileType.PDF,
output_type="processed",
file_name="technical_doc"
)
# Load PDF (extracts text content)
loaded_content = file_utils.load_document_from_storage(
file_path="simple_pdf.pdf",
input_type="processed"
)
# Save PPTX from bytes
with open("slides.pptx", "rb") as f:
pptx_bytes = f.read()
saved_path, _ = file_utils.save_document_to_storage(
content=pptx_bytes,
output_filetype=OutputFileType.PPTX,
output_type="processed",
file_name="quarterly_review",
sub_path="presentations/2024/Q1"
)
# Or save PPTX from a local file path
saved_path, _ = file_utils.save_document_to_storage(
content="/absolute/path/to/slides.pptx",
output_filetype=OutputFileType.PPTX,
output_type="processed",
file_name="quarterly_review",
sub_path="presentations/2024/Q1"
)
# Load PPTX (returns bytes)
pptx_bytes = file_utils.load_document_from_storage(
file_path="quarterly_review.pptx",
input_type="processed",
sub_path="presentations/2024/Q1"
)
Document functionality requires optional dependencies:
# Install document support
pip install 'FileUtils[documents]'
# Or install specific dependencies
pip install python-docx markdown PyMuPDF
Note: Markdown functionality works without additional dependencies. DOCX and PDF require the optional packages. PPTX support operates on raw files and does not require additional dependencies.
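If documents may be generated in environments where the optional packages are not installed, one option is to fall back to Markdown, which needs no extras. The try/except import guard below is an application-level sketch, not part of the FileUtils API:
# Sketch: prefer DOCX output, fall back to Markdown when python-docx is missing.
try:
    import docx  # provided by the optional python-docx package
    preferred_format = OutputFileType.DOCX
except ImportError:
    preferred_format = OutputFileType.MARKDOWN

saved_path, _ = file_utils.save_document_to_storage(
    content="# Release notes",
    output_filetype=preferred_format,
    output_type="processed",
    file_name="release_notes"
)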
FileUtils manages data in a structured directory layout:
project_root/
├── data/
│ ├── raw/ # Raw data files
│ ├── processed/ # Processed data files
│ └── interim/ # Intermediate data files
└── reports/
└── figures/ # Generated figures
You can create new directories within this structure:
# Create new directory under data/
features_dir = file_utils.create_directory("features")
# Create directory under specific parent
reports_dir = file_utils.create_directory("monthly", parent_dir="reports")
# Directory is added to configuration structure
print(file_utils.config["directory_structure"]["data"]) # Shows ['raw', 'processed', 'interim', 'features']You can override default settings using a config.yaml file:
# File handling
csv_delimiter: ","
encoding: "utf-8"
quoting: "minimal"
include_timestamp: false
# Logging
logging_level: "INFO"
# Directory structure
directory_structure:
data:
- raw
- processed
- interim
reports:
- figures
models:
- trainedFor Azure Blob Storage operations, see AZURE_SETUP.md.
FileUtils provides detailed error messages through custom exceptions:
- StorageError: Base exception for storage operations
- StorageOperationError: Specific operation failures (e.g., file not found, invalid format)
- ConfigurationError: Configuration-related issues
Example error handling:
from FileUtils.core.base import StorageError
try:
df = file_utils.load_single_file("nonexistent.csv")
except StorageError as e:
print(f"Failed to load file: {e}")Problem: TypeError: Object of type Timestamp is not JSON serializable
Solution: Use save_document_to_storage() instead of manual JSON serialization:
# ❌ This will fail
import json
data = {'date': pd.Timestamp.now()}
json.dumps(data) # TypeError
# ✅ This works automatically
saved_path, _ = file_utils.save_document_to_storage(
content=data,
output_filetype=OutputFileType.JSON,
output_type="processed",
file_name="data"
)
Problem: ModuleNotFoundError: No module named 'docx' or ModuleNotFoundError: No module named 'fitz'
Solution: Install document dependencies:
# Install all document support
pip install 'FileUtils[documents]'
# Or install specific dependencies
pip install python-docx markdown PyMuPDF
Problem: FileNotFoundError when loading files saved with timestamps
Solution: Use base filename - FileUtils automatically finds timestamped files:
# Save with timestamp (creates: report_20241018_143022.json)
saved_path, _ = file_utils.save_document_to_storage(
content=content,
output_filetype=OutputFileType.JSON,
file_name="report"
)
# Load by base name (automatically finds the timestamped file)
loaded_data = file_utils.load_json(
file_path="report.json", # Not "report_20241018_143022.json"
input_type="processed"
)
Problem: NotImplementedError: Writing to Excel with MultiIndex columns and no index
Solution: FileUtils automatically handles MultiIndex columns by flattening them:
# FileUtils automatically flattens MultiIndex columns
df_with_multiindex = pd.DataFrame({
('A', 'x'): [1, 2, 3],
('A', 'y'): [4, 5, 6],
('B', 'z'): [7, 8, 9]
})
# This works automatically
saved_files, metadata = file_utils.save_data_to_storage(
data={'data': df_with_multiindex},
output_filetype=OutputFileType.XLSX,
output_type="processed",
file_name="multiindex_data"
)
Problem: StorageConnectionError when using Azure storage
Solution: Check your connection string and credentials:
from FileUtils.core.base import StorageConnectionError
try:
azure_utils = FileUtils(
storage_type="azure",
connection_string="your_connection_string"
)
except StorageConnectionError as e:
print(f"Azure connection failed: {e}")
# Fall back to local storage
file_utils = FileUtils(storage_type="local")Problem: Configuration not loading or validation errors
Solution: Check your configuration file format:
# Validate configuration
try:
file_utils = FileUtils(config_file="config.yaml")
except Exception as e:
print(f"Configuration error: {e}")
# Use default configuration
file_utils = FileUtils()
For large files, consider these optimizations:
# Use Parquet for large datasets
file_utils.save_data_to_storage(
data=large_df,
output_filetype=OutputFileType.PARQUET,
compression="snappy" # Fast compression
)
# Use chunked processing for very large files
chunk_size = 10000
for i, chunk in enumerate(pd.read_csv("large_file.csv", chunksize=chunk_size)):
file_utils.save_data_to_storage(
data={'chunk': chunk},
output_filetype=OutputFileType.PARQUET,
file_name=f"chunk_{i:04d}"
)
# For large DataFrames, use appropriate data types
df = df.astype({
'category': 'category', # Reduces memory usage
'id': 'int32', # Instead of int64
'price': 'float32' # Instead of float64
})