Skip to content

Commit 35eeed4

Browse files
Merge pull request #330 from HXLStandard/HXL-46
HXL-46 Add info() for data formats other than XLSX and XLS
2 parents 7f6fa0d + 96ded39 commit 35eeed4

5 files changed

Lines changed: 208 additions & 79 deletions

File tree

CHANGELOG

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
Release 4.29
2+
- remove hxl.input.ExcelInput.info() and make a top-level hxl.input.info() function that works with every data type (also alias to hxl.info())
3+
14
2023-03-20 Release 4.28:
25
- update requirements to allow latest versions of dependencies
36
- don't fall back to CSV if we have a MIME type or file extension that's not in the allow list (which is fairly liberal)

hxl/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ def __str__(self):
112112
import hxl.geo
113113
import hxl.datatypes
114114
from hxl.model import TagPattern, Dataset, Column, Row, RowQuery
115-
from hxl.input import data, tagger, HXLParseException, write_hxl, make_input, InputOptions, from_spec
115+
from hxl.input import data, info, tagger, HXLParseException, write_hxl, make_input, InputOptions, from_spec
116116
from hxl.validation import schema, validate, HXLValidationException
117117

118118
# end

hxl/input.py

Lines changed: 112 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,85 @@ def data(data, input_options=None):
218218

219219
return HXLReader(make_input(data, input_options))
220220

221+
222+
def info(data, input_options=None):
    """ Return info about a data source (rather than the data itself)

    Top-level properties:
    - url_or_filename
    - format ("XLSX", "XLS", "CSV", "JSON", or "Arrays")
    - sheets

    Per-sheet properties:
    - name (always "__DEFAULT__" if not XLS or XLSX)
    - nrows
    - ncols
    - is_hidden (always False if not XLS or XLSX)
    - has_merged_cells (always False if not XLSX)
    - is_hxlated
    - header_hash (hash of the first raw row)
    - hxl_hashtag_hash (hash of the HXL hashtag row and preceding header row, if HXLated)

    Args:
        data: a HXL data provider, file object, array, or string (representing a URL or file name).
        input_options (InputOptions): options for reading a dataset.

    Returns:
        A dict containing info about the object.

    Raises:
        IOError: if there's an error loading the data.
        hxl.HXLException: if there's a structural error in the data.
        hxl.input.HXLAuthorizationException: if the source requires some kind of authorisation (possibly fixable by adding an Authorization: header to the ``http_headers`` arg).

    """

    # "source_input" rather than "input" to avoid shadowing the builtin
    # NOTE(review): the input is never explicitly closed here; for file- or
    # URL-backed sources the underlying stream is left to the GC — confirm
    # whether callers expect info() to close it.
    source_input = make_input(data, input_options)
    result = {
        "url_or_filename": source_input.url_or_filename,
        "format": source_input.format,
    }

    if result["format"] in ("XLS", "XLSX",):
        # Excel metadata is special: the workbook knows its own sheets
        result["sheets"] = source_input.get_sheet_info()

    else:
        # Otherwise, compute from the content

        # iterate through the rows, tracking size and keeping the first 25
        opening_rows = []
        nrows = 0
        ncols = 0
        for row in source_input:
            nrows += 1
            if len(row) > ncols:
                ncols = len(row)
            if nrows <= 25:
                opening_rows.append(row)

        # See if the first 25 rows are HXLated
        try:
            source = HXLReader(opening_rows)
            hxl_hashtag_hash = source.columns_hash
        except HXLTagsNotFoundException:
            hxl_hashtag_hash = None

        # Non-Excel sources always present as a single default sheet
        result["sheets"] = [
            {
                "name": "__DEFAULT__",
                "nrows": nrows,
                "ncols": ncols,
                "is_hidden": False,
                "has_merged_cells": False,
                "is_hxlated": hxl_hashtag_hash is not None,
                "header_hash": hash_row(opening_rows[0]) if nrows > 0 else None,
                "hxl_hashtag_hash": hxl_hashtag_hash,
            },
        ]

    return result
299+
221300

222301
def tagger(data, specs, input_options=None, default_tag=None, match_all=False):
223302
"""Open an untagged data source and add hashtags.
@@ -265,6 +344,7 @@ def tagger(data, specs, input_options=None, default_tag=None, match_all=False):
265344
)
266345

267346

347+
268348
def write_hxl(output, source, show_headers=True, show_tags=True):
269349
"""Serialize a HXL dataset to an output stream in CSV format.
270350
@@ -818,36 +898,12 @@ class AbstractInput(object):
818898

819899
__metaclass__ = abc.ABCMeta
820900

821-
def __init__(self, input_options, url_or_filename=None):
    """ Set up shared state for a raw HXL input source.

    Args:
        input_options (InputOptions): options for reading a dataset.
        url_or_filename (string): the original URL or filename, or None.

    """
    super().__init__()
    self.input_options = input_options
    # Bug fix: store the constructor argument instead of hard-coding None.
    # Subclasses (e.g. ExcelInput) forward url_or_filename through
    # super().__init__(), and the top-level info() function reports it,
    # so discarding it here would make "url_or_filename" always None.
    self.url_or_filename = url_or_filename
    self.is_repeatable = False
825906

826-
def info(self):
827-
""" Get information about the raw dataset.
828-
Uses low-level row-wise input, so the source doesn't have to be HXLated.
829-
830-
The result will be a dict with info about the workbook:
831-
832-
- format (e.g. "XLSX")
833-
- sheets (list)
834-
835-
The following will appear for each sheet:
836-
837-
- sheet_name (string)
838-
- is_hidden (boolean)
839-
- nrows (int)
840-
- ncols (int)
841-
- has_merged_cells (boolean)
842-
- is_hxlated (boolean)
843-
- header_hash (MD5 string)
844-
- hashtag hash (MD5 string, or null if not HXLated)
845-
846-
(Currently supported only for Excel.)
847-
848-
"""
849-
raise NotImplementedError()
850-
851907
@abc.abstractmethod
852908
def __iter__(self):
853909
return self
@@ -876,20 +932,22 @@ class CSVInput(AbstractInput):
876932
_DELIMITERS = [",", "\t", ";", ":", "|"]
877933
""" CSV delimiters allowed """
878934

879-
def __init__(self, input, input_options):
935+
def __init__(self, input, input_options, url_or_filename=None):
880936
"""
881937
Args:
882938
input (io.IOBase): a byte input stream
883939
input_options (InputOptions): options for reading a dataset.
884940
885941
"""
886-
super().__init__(input_options)
942+
super().__init__(input_options, url_or_filename)
943+
944+
self.format = "CSV"
887945

888946
# guess the delimiter
889-
delimiter = CSVInput._detect_delimiter(input, input_options.encoding or "utf-8")
947+
self.delimiter = CSVInput._detect_delimiter(input, input_options.encoding or "utf-8")
890948

891949
self._input = io.TextIOWrapper(input, encoding=input_options.encoding, errors="replace")
892-
self._reader = csv.reader(self._input, delimiter=delimiter)
950+
self._reader = csv.reader(self._input, delimiter=self.delimiter)
893951

894952
def __exit__(self, value, type, traceback):
895953
self._input.close()
@@ -964,16 +1022,17 @@ class JSONInput(AbstractInput):
9641022
9651023
"""
9661024

967-
def __init__(self, input, input_options):
1025+
def __init__(self, input, input_options, url_or_filename=None):
9681026
"""
9691027
Args:
9701028
input (io.IOBase): an input byte stream
9711029
input_options (InputOptions): options for reading a dataset.
9721030
9731031
"""
974-
super().__init__(input_options)
1032+
super().__init__(input_options, url_or_filename)
9751033

9761034
# values to be set by _scan_data_element
1035+
self.format = 'JSON'
9771036
self.type = None
9781037
self.headers = []
9791038
self.show_headers = False
@@ -1122,8 +1181,7 @@ def __init__(self, contents, input_options, url_or_filename=None):
11221181
input_options (InputOptions): options for reading a dataset.
11231182
url_or_filename (string): the original URL or filename or None
11241183
"""
1125-
super().__init__(input_options)
1126-
self.url_or_filename = url_or_filename
1184+
super().__init__(input_options, url_or_filename)
11271185
self.is_repeatable = True
11281186
self.contents = contents
11291187

@@ -1134,24 +1192,16 @@ def __init__(self, contents, input_options, url_or_filename=None):
11341192
sheet_index = self._find_hxl_sheet_index()
11351193

11361194
self._sheet = self._get_sheet(sheet_index)
1195+
1196+
self.format = "XLSX" if self._workbook.biff_version == 0 else "XLS"
1197+
11371198
self.merged_values = {}
11381199

1139-
def info (self):
1140-
""" See method doc for parent class """
1200+
def get_sheet_info (self):
1201+
""" Return sheet metadata for the top-level info() function """
1202+
1203+
result = [] # list of dicts containing info for each sheet in the workbook
11411204

1142-
def hash_headers (raw_row):
1143-
""" Create a hash just for the first row of values
1144-
"""
1145-
md5 = hashlib.md5()
1146-
for value in raw_row:
1147-
md5.update(hxl.datatypes.normalise_space(value).encode('utf-8'))
1148-
return md5.hexdigest()
1149-
1150-
result = {
1151-
"url_or_filename": self.url_or_filename,
1152-
"format": "XLSX" if self._workbook.biff_version == 0 else "XLS",
1153-
"sheets": [],
1154-
}
11551205
for sheet_index in range(0, self._workbook.nsheets):
11561206
sheet = self._get_sheet(sheet_index)
11571207
columns = self._get_columns(sheet)
@@ -1162,10 +1212,10 @@ def hash_headers (raw_row):
11621212
"ncols": sheet.ncols,
11631213
"has_merged_cells": (len(sheet.merged_cells) > 0),
11641214
"is_hxlated": (columns is not None),
1165-
"header_hash": hash_headers(self._get_row(sheet, 0)) if sheet.nrows > 0 else None,
1166-
"hashtag_hash": hxl.model.Column.hash_list(columns) if columns else None,
1215+
"header_hash": hash_row(self._get_row(sheet, 0)) if sheet.nrows > 0 else None,
1216+
"hxl_hashtag_hash": hxl.model.Column.hash_list(columns) if columns else None,
11671217
}
1168-
result["sheets"].append(sheet_info)
1218+
result.append(sheet_info)
11691219
return result
11701220

11711221
def __iter__(self):
@@ -1314,6 +1364,7 @@ def __init__(self, data):
13141364
13151365
"""
13161366
super().__init__(input_options=None)
1367+
self.format = 'Arrays'
13171368
self.data = data
13181369
self.is_repeatable = True
13191370

@@ -1834,4 +1885,13 @@ def _get_kobo_url(asset_id, url, input_options, max_export_age_seconds=14400):
18341885
time.sleep(2)
18351886

18361887

1888+
def hash_row (row):
    """ Compute an MD5 hex digest for a row of values.

    Each value is whitespace-normalised before being folded into the
    digest, so cosmetic spacing differences don't change the hash.

    Args:
        row: an iterable of cell values.

    Returns:
        The MD5 hex-digest string for the normalised row.

    """
    digest = hashlib.md5()
    normalised_values = (hxl.datatypes.normalise_space(value) for value in row)
    for text in normalised_values:
        digest.update(text.encode('utf-8'))
    return digest.hexdigest()
1895+
1896+
18371897
# end
9 KB
Binary file not shown.

0 commit comments

Comments
 (0)