diff --git a/Orange/OrangeWidgets/Prototypes/OWCSVFileImport.py b/Orange/OrangeWidgets/Prototypes/OWCSVFileImport.py index 274f43d09..d0e68d7a9 100644 --- a/Orange/OrangeWidgets/Prototypes/OWCSVFileImport.py +++ b/Orange/OrangeWidgets/Prototypes/OWCSVFileImport.py @@ -1,5 +1,6 @@ """ CSV File import +icons/FileCSV.png Import comma separated file """ @@ -49,7 +50,7 @@ def reload_icon(self): class OWCSVFileImport(OWWidget): - settingsList = ["recent_files", "hints"] + settingsList = ["recent_files", "hints","ignore_first_lines"] DELIMITERS = [("Tab", "\t"), ("Comma", ","), @@ -75,6 +76,8 @@ def __init__(self, parent=None, signalManager=None, self.skipinitialspace = True self.has_header = True self.has_orange_header = True + self.ignore_first_lines = 0 #3 + self.add_simple_orange_header = False # # List of recent opened files. self.recent_files = [] @@ -190,6 +193,12 @@ def __init__(self, parent=None, signalManager=None, form.addRow(self.skipinitialspace_check) + self.spin_sk_ln= OWGUI.spin(box, self, "ignore_first_lines", label="Skip first lines", # !!!!!!!!!!!!!!!!!!!!!!!!!!!!! + min=0, max=1000, step=1, + callback=self.ignore_first_lines_changed, + controlWidth=40, + keyboardTracking=False) + self.has_header_check = \ QCheckBox(objectName="has_header_check", checked=self.has_header, @@ -254,6 +263,16 @@ def quote_changed(self): self.quote = str(self.quote_edit.text()) self.update_preview() + def ignore_first_lines_changed(self): # !!!!!!!!!!!!!!!!!! + #self.ignore_first_lines = self.spin_sk_ln.value() + if self.selected_file: + with open(self.selected_file, "rU") as f: + self.skipinitiallines(f) + self.selected_file_head=[] + for i, line in zip(range(30), f): + self.selected_file_head.append(line) + self.update_preview() + def missing_changed(self): self.missing = str(self.missing_edit.text()) self.update_preview() @@ -270,6 +289,20 @@ def skipinitialspace_changed(self): self.skipinitialspace = self.skipinitialspace_check.isChecked() self.update_preview() + def skipinitiallines(self,file): # !!!!!!!!!!!!!! + ignore=self.ignore_first_lines + while ignore and file.readline(): + ignore-=1 + + def open_and_skiplines(self,file, mode="rb"): # !!!!!!!!!!!!!! + if isinstance(file, basestring): + file = open(file, mode) + else: # assuming it is file like with proper mode, could check for write, read + pass + self.skipinitiallines(file) + return file + + def set_selected_file(self, filename): basedir, name = os.path.split(filename) index_to_remove = None @@ -291,7 +324,7 @@ def set_selected_file(self, filename): hints = self.hints[filename] else: try: - hints = sniff_csv(filename) + hints = self.sniff_csv(filename) except csv.Error, ex: self.warning(1, str(ex)) hints = dict(DEFAULT_HINTS) @@ -338,6 +371,7 @@ def set_selected_file(self, filename): self.selected_file = filename self.selected_file_head = [] with open(self.selected_file, "rU") as f: + self.skipinitiallines(f) for i, line in zip(range(30), f): self.selected_file_head.append(line) @@ -357,7 +391,7 @@ def update_preview(self): hints["skipinitialspace"] = self.skipinitialspace hints["DK"] = self.missing or None try: - data = Orange.data.io.load_csv(head, delimiter=self.delimiter, + data = Orange.data.io.load_csv(head, delimiter=self.delimiter, quotechar=self.quote, has_header=self.has_header, has_types=self.has_orange_header, @@ -379,7 +413,9 @@ def send_data(self): self.error(0) if self.selected_file: try: - data = Orange.data.io.load_csv(self.selected_file, + with open(self.selected_file, "rb") as f: + self.skipinitiallines(f) + data = Orange.data.io.load_csv(f, delimiter=self.delimiter, quotechar=self.quote, has_header=self.has_header, @@ -397,26 +433,28 @@ def send_data(self): self.send("Data", self.data) -def sniff_csv(file): - snifer = csv.Sniffer() - if isinstance(file, basestring): - file = open(file, "rU") - - sample = file.read(2 ** 20) # max 1MB sample - dialect = snifer.sniff(sample) - has_header = snifer.has_header(sample) - - return {"delimiter": dialect.delimiter, - "doublequote": dialect.doublequote, - "escapechar": dialect.escapechar, - "quotechar": dialect.quotechar, - "quoting": dialect.quoting, - "skipinitialspace": dialect.skipinitialspace, - "has_header": has_header, - "has_orange_header": False, - "skipinitialspace": True, - "DK": None, - } + def sniff_csv(self,file): + snifer = csv.Sniffer() + if isinstance(file, basestring): + with open(file, "rb") as f: + self.skipinitiallines(f) + sample = f.read(2 ** 20) # max 1MB sample self opened file + else: + sample = file.read(2 ** 20) # max 1MB sample + dialect = snifer.sniff(sample) + has_header = snifer.has_header(sample) + + return {"delimiter": dialect.delimiter, + "doublequote": dialect.doublequote, + "escapechar": dialect.escapechar, + "quotechar": dialect.quotechar, + "quoting": dialect.quoting, + "skipinitialspace": dialect.skipinitialspace, + "has_header": has_header, + "has_orange_header": False, + "skipinitialspace": True, + "DK": None, + } if __name__ == "__main__": import sys diff --git a/Orange/OrangeWidgets/Prototypes/icons/FileCSV.png b/Orange/OrangeWidgets/Prototypes/icons/FileCSV.png new file mode 100644 index 000000000..cabb4cc0e Binary files /dev/null and b/Orange/OrangeWidgets/Prototypes/icons/FileCSV.png differ diff --git a/Orange/data/io.py b/Orange/data/io.py index 72424b7b5..d385d85e6 100644 --- a/Orange/data/io.py +++ b/Orange/data/io.py @@ -630,6 +630,7 @@ def load_csv(file, create_new_on=MakeStatus.Incompatible, """Load an Orange.data.Table from a csv file.""" file = as_open_file(file, "rU") + start=file.tell() snifer = csv.Sniffer() # Max 5MB sample @@ -647,7 +648,7 @@ def load_csv(file, create_new_on=MakeStatus.Incompatible, except csv.Error: has_header = False - file.seek(0) # Rewind + file.seek(start) # Rewind def kwparams(**kwargs): """Return not None kwargs. @@ -724,7 +725,7 @@ def kwparams(**kwargs): var_attrs += [None] * (len(header) - len(var_attrs)) # start from the beginning - file.seek(0) + file.seek(start) reader = csv.reader(file, dialect=dialect, **fmtparam) for defined in [has_header, has_types, has_annotations]: