Skip to content

Commit bb2524a

Browse files
authored
Merge pull request #5 from ahamptonTIA/20240320
v0.0.4
2 parents acb59a0 + 9291aa6 commit bb2524a

1 file changed

Lines changed: 61 additions & 2 deletions

File tree

src/schema_validata.py

Lines changed: 61 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import ast
55
import math
66
import hashlib
7+
import chardet
78
import re
89
import warnings
910
from datetime import datetime
@@ -577,6 +578,53 @@ def infer_datetime_column(df,
577578

578579
# ----------------------------------------------------------------------------------
579580

581+
def detect_file_encoding(file_path, sample_size=None):
    """Detects the character encoding of a text-based file using chardet library.

    This function is useful for determining the appropriate encoding when reading
    files that may not explicitly declare their encoding. It analyzes the raw
    bytes of the file (the whole file by default, or only a leading sample when
    ``sample_size`` is given) to identify the most likely character encoding
    scheme used.

    Parameters:
    ----------
    file_path (str):
        The path to the target file.
    sample_size (int, optional):
        Maximum number of bytes to read from the start of the file for
        detection. Defaults to None, which reads the entire file. A smaller
        sample lowers memory use and run time on large files, but may also
        lower the detection confidence.
    Returns:
    ----------
    str or None:
        The detected character encoding of the file. If chardet cannot
        determine the encoding with sufficient confidence (50% or less),
        None is returned so that callers can pass it straight through as
        ``encoding=None`` and let pandas apply its own default.
    Raises:
    ----------
    OSError:
        If the specified file cannot be opened for reading.
    """

    try:
        # Open the file in binary mode to read raw bytes
        with open(file_path, 'rb') as f:
            if sample_size is None:
                rawdata = f.read()
            else:
                rawdata = f.read(sample_size)
    except OSError as e:
        # Chain explicitly so the original errno/traceback stays attached
        raise OSError(f"Error opening file: {file_path}. {e}") from e

    # Use chardet to analyze the byte data and detect encoding
    result = chardet.detect(rawdata)

    # Treat a missing or None confidence the same as zero confidence
    confidence = result.get('confidence') or 0.0

    # Only trust the detection when confidence is strictly above 50%
    if confidence > 0.5:
        return result.get('encoding')

    # Low confidence: return None (not a guessed name) so the caller's
    # reader falls back to its own default encoding.
    print(f"Encoding confidence for '{file_path}' is low (< 50%). Using pandas default.")
    return None
625+
626+
# ----------------------------------------------------------------------------------
627+
580628
def read_spreadsheets(file_path,
581629
sheet_name=None,
582630
dtype=None,
@@ -626,20 +674,29 @@ def read_spreadsheets(file_path,
626674
filename = os.path.basename(file_path)
627675
base_name, ext = os.path.splitext(filename)
628676

677+
678+
629679
if ext in [".xlsx", ".xls"]:
630680
xls = pd.ExcelFile(file_path)
631681
df = pd.read_excel(file_path,
632682
sheet_name=sheet_name,
633683
dtype=dtype,
634684
na_values=na_values)
635685
elif ext == ".csv":
636-
df = pd.read_csv(file_path, dtype=dtype, na_values=na_values)
686+
encoding=detect_file_encoding(file_path)
687+
df = pd.read_csv(file_path,
688+
dtype=dtype,
689+
na_values=na_values,
690+
encoding=encoding)
637691
else:
638692
raise ValueError(f"Unsupported file extension: {ext}")
639693

640694
if rm_newlines:
641695
df = remove_pd_df_newlines(df, replace_char=replace_char)
642696

697+
# Use str.strip() to remove leading and trailing spaces from column names
698+
df.columns = df.columns.str.strip()
699+
643700
return df
644701

645702
# ----------------------------------------------------------------------------------
@@ -1551,7 +1608,9 @@ def write_dataframes_to_xlsx(dataframes,
15511608
index=False)
15521609
count += 1
15531610
else:
1554-
df.to_excel(writer, sheet_name=sheet_name, index=False)
1611+
df.to_excel(writer,
1612+
sheet_name=sheet_name,
1613+
index=False)
15551614

15561615
# Overwrite the file if it exists already
15571616
if os.path.exists(output_path):

0 commit comments

Comments
 (0)