|
4 | 4 | import ast |
5 | 5 | import math |
6 | 6 | import hashlib |
| 7 | +import chardet |
7 | 8 | import re |
8 | 9 | import warnings |
9 | 10 | from datetime import datetime |
@@ -577,6 +578,53 @@ def infer_datetime_column(df, |
577 | 578 |
|
578 | 579 | # ---------------------------------------------------------------------------------- |
579 | 580 |
|
def detect_file_encoding(file_path):
    """Detect the character encoding of a file using the chardet library.

    The file is opened in binary mode, its entire content is read, and the
    raw bytes are passed to ``chardet.detect``. This is useful for picking
    an encoding when reading files that do not declare one.

    Parameters:
    ----------
    file_path (str):
        The path to the target file.
    Returns:
    ----------
    str or None:
        The encoding name reported by chardet when its confidence is
        strictly greater than 0.5. Otherwise ``None`` is returned (and a
        message is printed) so that the caller can fall back to its own
        default, e.g. ``pandas.read_csv(..., encoding=None)``.
    Raises:
    ----------
    OSError:
        If the specified file cannot be opened or read. The underlying
        exception (e.g. ``FileNotFoundError``) is attached as
        ``__cause__``.
    """
    try:
        # Binary mode: chardet needs the undecoded bytes.
        with open(file_path, 'rb') as f:
            rawdata = f.read()
    except OSError as e:
        # Chain explicitly so the specific failure type/errno stays
        # reachable through ``__cause__``.
        raise OSError(f"Error opening file: {file_path}. {e}") from e

    result = chardet.detect(rawdata)

    # Guard against a missing/None confidence or encoding (e.g. for empty
    # input) instead of indexing the result blindly.
    encoding = result.get('encoding')
    confidence = result.get('confidence') or 0.0

    if encoding is not None and confidence > 0.5:
        return encoding

    # Low confidence: signal "no opinion" with None rather than guessing.
    print(f"Encoding confidence for '{file_path}' is low (< 50%). Using pandas default.")
    return None
| 625 | + |
| 626 | +# ---------------------------------------------------------------------------------- |
| 627 | + |
580 | 628 | def read_spreadsheets(file_path, |
581 | 629 | sheet_name=None, |
582 | 630 | dtype=None, |
@@ -626,20 +674,29 @@ def read_spreadsheets(file_path, |
626 | 674 | filename = os.path.basename(file_path) |
627 | 675 | base_name, ext = os.path.splitext(filename) |
628 | 676 |
|
| 677 | + |
| 678 | + |
629 | 679 | if ext in [".xlsx", ".xls"]: |
630 | 680 | xls = pd.ExcelFile(file_path) |
631 | 681 | df = pd.read_excel(file_path, |
632 | 682 | sheet_name=sheet_name, |
633 | 683 | dtype=dtype, |
634 | 684 | na_values=na_values) |
635 | 685 | elif ext == ".csv": |
636 | | - df = pd.read_csv(file_path, dtype=dtype, na_values=na_values) |
| 686 | + encoding=detect_file_encoding(file_path) |
| 687 | + df = pd.read_csv(file_path, |
| 688 | + dtype=dtype, |
| 689 | + na_values=na_values, |
| 690 | + encoding=encoding) |
637 | 691 | else: |
638 | 692 | raise ValueError(f"Unsupported file extension: {ext}") |
639 | 693 |
|
640 | 694 | if rm_newlines: |
641 | 695 | df = remove_pd_df_newlines(df, replace_char=replace_char) |
642 | 696 |
|
| 697 | + # Use str.strip() to remove leading and trailing spaces from column names |
| 698 | + df.columns = df.columns.str.strip() |
| 699 | + |
643 | 700 | return df |
644 | 701 |
|
645 | 702 | # ---------------------------------------------------------------------------------- |
@@ -1551,7 +1608,9 @@ def write_dataframes_to_xlsx(dataframes, |
1551 | 1608 | index=False) |
1552 | 1609 | count += 1 |
1553 | 1610 | else: |
1554 | | - df.to_excel(writer, sheet_name=sheet_name, index=False) |
| 1611 | + df.to_excel(writer, |
| 1612 | + sheet_name=sheet_name, |
| 1613 | + index=False) |
1555 | 1614 |
|
1556 | 1615 | # Overwrite the file if it exists already |
1557 | 1616 | if os.path.exists(output_path): |
|
0 commit comments