DL4CV_final/dataset_preprocess.py at main · michaleldar/DL4CV_final · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import cv2
import numpy as np
import os
import glob
import pandas as pd
from datetime import datetime
import sys
sys.path.extend(['/home/michalel/PycharmProjects/LabData'])
sys.path.extend(['/home/michalel/PycharmProjects/LabQueue'])
sys.path.extend(['/home/michalel/PycharmProjects/LabUtils'])
from LabData.DataLoaders import BodyMeasuresLoader, SubjectLoader, UltrasoundLoader

class Dataset:
    def _init_(self):
        pass

    def extract_patch(self, image, top_left, patch_size):
        x, y = top_left
        h, w = patch_size
        return image[y:y+h, x:x+w]

    def find_images_with_patch(self, directory, reference_patch, patch_top_left, threshold=0.9):
        reference_patch_gray = cv2.cvtColor(reference_patch, cv2.COLOR_BGR2GRAY)
        matching_images = []

        for filename in os.listdir(directory):
            if filename.lower().endswith('.jpg'):
                image_path = os.path.join(directory, filename)
                image = cv2.imread(image_path)
                patch = self.extract_patch(image, patch_top_left, reference_patch.shape[:2])
                patch_gray = cv2.cvtColor(patch, cv2.COLOR_BGR2GRAY)
                result = cv2.matchTemplate(patch_gray, reference_patch_gray, cv2.TM_CCOEFF_NORMED)
                if np.max(result) >= threshold:
                    matching_images.append(image_path)
        return matching_images

    def filter_us(self, search_directory='/net/mraid20/export/genie/LabData/Data/10K/aws_lab_files/ultrasound/jpg/1001201093/00_00_visit/20210826/'):
        patch_size, patch_top_left, reference_image = self.us_patch_id()
        reference_patch = self.extract_patch(reference_image, patch_top_left, patch_size)
        matching_images = self.find_images_with_patch(search_directory, reference_patch, patch_top_left)
        return matching_images

    def us_patch_id(self):
        reference_image_path = '/net/mraid20/export/genie/LabData/Data/10K/aws_lab_files/ultrasound/jpg/1001201093/00_00_visit/20210826/103932.jpg'
        patch_top_left = (1130, 925)
        patch_size = (300, 155)
        reference_image = cv2.imread(reference_image_path)
        return patch_size, patch_top_left, reference_image

    @staticmethod
    def _convert_to_date(date):
        if "_" in date:
            date = date.replace("_", "")
        return datetime.strptime(date, '%Y%m%d').date()

    @staticmethod
    def keep_first_date(df):
        df['Date'] = pd.to_datetime(df['Date'])
        df = df.sort_values(by='Date', ascending=True)
        df = df[~df.index.duplicated(keep='first')]
        return df

    def create_dataset(self, run_map_images=False):
        print('Creating dataset...')
        subject_data = SubjectLoader.SubjectLoader().get_data(study_ids=['10K']).df.reset_index(level=[1])
        measures_data = BodyMeasuresLoader.BodyMeasuresLoader().get_data(study_ids=['10K']).df.reset_index(level=[1])
        ultrasounds_data = UltrasoundLoader.UltrasoundLoader().get_data(study_ids=['10K']).df.reset_index(level=[1])
        subject_data['Date'] = subject_data['Date'].dt.date
        measures_data['Date'] = measures_data['Date'].dt.date
        ultrasounds_data['Date'] = ultrasounds_data['Date'].dt.date

        subject_data = self.keep_first_date(subject_data[~subject_data.index.duplicated(keep='first')])
        measures_data = self.keep_first_date(measures_data[~measures_data.index.duplicated(keep='first')])
        ultrasounds_data = self.keep_first_date(ultrasounds_data[~ultrasounds_data.index.duplicated(keep='first')])
        df = pd.merge(subject_data, measures_data, left_index=True, right_index=True)
        df = pd.merge(df, ultrasounds_data, left_index=True, right_index=True)
        nafl = pd.read_csv("/net/mraid20/export/genie/LabData/Data/10K/for_review/baseline_conditions_all.csv",
                           index_col=0)
        nafl2 = pd.read_csv("/net/mraid20/export/genie/LabData/Data/10K/for_review/follow_up_conditions_all.csv",
                            index_col=0)
        print (nafl.columns)

        nafl = pd.concat([nafl, nafl2])[["medical_condition"]]
        print (nafl.columns)
        df = pd.merge(df, nafl, left_index=True, right_index=True)

        # print number of rows in df
        print('Number of rows in df:', df.shape[0])
        if run_map_images:
            self.map_subject_to_images()
        images = np.load('/home/michalel/PycharmProjects/basic/map_sbj_to_img.npy', allow_pickle='TRUE').item()
        images = pd.DataFrame.from_dict(images, orient='index')
        images = images.rename(columns={0: 'image_path'})
        # print the index of images, and index of df
        print('Index of images:', images.index[:5])
        print('Index of df:', df.index[:5])
        df = df.merge(images, left_index=True, right_index=True)
        print('Number of rows in df:', df.shape[0])
        df['liver_attenuation'] = df.filter(regex='att_plus_ssp_plus_db_cm_mhz').mean(axis=1)
        df['liver_sound_speed'] = df.filter(regex='att_plus_ssp_plus_m_s').mean(axis=1)
        print('Number of rows in df:', df.shape[0])

        return df

    def map_subject_to_images(self, subjects_path_rgx='/net/mraid20/export/genie/LabData/Data/10K/aws_lab_files/ultrasound/jpg/*/00_00_visit/*', path_to_save='/home/michalel/PycharmProjects/basic/map_sbj_to_img.npy'):
        print ('Mapping subject to images...')
        dataset = {}
        count = 0
        for directory in glob.glob(subjects_path_rgx):
            count += 1
            matching_images = self.filter_us(directory)
            key = "10K_" + directory.split('jpg/')[-1].split('/')[0]
            date = self._convert_to_date(directory.split('jpg/')[-1].split('/')[2])
            dataset[key] = matching_images
            if count % 100 == 0:
                print(f'{count} subjects processed')
        np.save(path_to_save, dataset)
        print('Mapping done')


if __name__ == '__main__':
    dataset = Dataset()
    # result = dataset.create_dataset()