Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions src/midrc_react/core/data_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,9 @@ def _adjust_outliers(df: pd.DataFrame, cut_column_name: str, column_name: str, b
Returns:
pd.DataFrame: DataFrame with the outliers adjusted in the cut column
"""
new_text = "Outlier"
low_text = new_text + "_Low"
high_text = new_text + "_High"
new_text = "Not Reported"
low_text = "Outlier_Low"
high_text = "Outlier_High"
print(f"WARNING: There are values outside the bins specified for the '{column_name}' column.")
df.loc[df[cut_column_name].isna() & (df[column_name] < bins[0]), cut_column_name] = low_text
df.loc[df[cut_column_name].isna() & (df[column_name] >= bins[-1]), cut_column_name] = high_text
Expand Down
2 changes: 1 addition & 1 deletion src/midrc_react/core/excel_layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ def build_data_frames_from_csv(self, filename: str):
None
"""
delimiter = ',' if filename.endswith('.csv') else '\t'
df = pd.read_csv(filename, delimiter=delimiter)
df = pd.read_csv(filename, delimiter=delimiter, low_memory=False)

# Apply preprocessing if a plugin is available
if self.preprocessor:
Expand Down
6 changes: 3 additions & 3 deletions src/midrc_react/plugins/midrc_tsv_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def combine_race_ethnicity(df):
def classify(row):
race, ethnicity = row['race'], row['ethnicity']

if race == 'Not Reported' or ethnicity == 'Not Reported':
if race == 'Not Reported' or ethnicity == 'Not Reported' or pd.isna(race) or pd.isna(ethnicity):
return 'Not Reported'
if ethnicity == 'Hispanic or Latino':
return ethnicity
Expand Down Expand Up @@ -103,14 +103,14 @@ def process_dataframe(df):

def process_tsv_to_tsv(input_file, output_file):
    """Read a TSV file, process it, and write the result to a new TSV file.

    Args:
        input_file: Tab-separated input, passed straight to ``pd.read_csv``.
        output_file: Destination for the processed data, passed straight to
            ``DataFrame.to_csv``.
    """
    # Parse the input exactly once. low_memory=False makes pandas infer each
    # column's dtype from the whole file instead of chunk by chunk, which
    # avoids mixed-type columns on large inputs.
    df = pd.read_csv(input_file, sep='\t', low_memory=False)
    df = process_dataframe(df)
    # index=False keeps the DataFrame index out of the written file.
    df.to_csv(output_file, sep='\t', index=False)


def process_tsv_to_dataframe(input_file):
    """Read a TSV file, process it, and return the resulting DataFrame.

    Args:
        input_file: Tab-separated input, passed straight to ``pd.read_csv``.

    Returns:
        Whatever ``process_dataframe`` returns for the parsed input.
    """
    # Parse the input exactly once. low_memory=False makes pandas infer each
    # column's dtype from the whole file instead of chunk by chunk, which
    # avoids mixed-type columns on large inputs.
    df = pd.read_csv(input_file, sep='\t', low_memory=False)
    return process_dataframe(df)


Expand Down