@@ -218,6 +218,85 @@ def data(data, input_options=None):
218218
219219 return HXLReader (make_input (data , input_options ))
220220
def info(data, input_options=None):
    """Return info about a data source (rather than the data itself).

    Top-level properties:
    - url_or_filename
    - format ("XLSX", "XLS", "CSV", "JSON", or "Arrays")
    - sheets

    Per-sheet properties:
    - name (always "__DEFAULT__" if not XLS or XLSX)
    - nrows
    - ncols
    - is_hidden (always False if not XLS or XLSX)
    - has_merged_cells (always False if not XLSX)
    - is_hxlated
    - header_hash (hash of the first raw row)
    - hxl_hashtag_hash (hash of the HXL hashtag row and preceding header row, if HXLated)

    Args:
        data: a HXL data provider, file object, array, or string (representing a URL or file name).
        input_options (InputOptions): options for reading a dataset.

    Returns:
        A dict containing info about the object.

    Raises:
        IOError: if there's an error loading the data.
        hxl.HXLException: if there's a structural error in the data.
        hxl.input.HXLAuthorizationException: if the source requires some kind of
            authorisation (possibly fixable by adding an Authorization: header
            to the ``http_headers`` arg).

    """

    # Only this many leading rows are retained for HXL detection and hashing.
    MAX_SCAN_ROWS = 25

    # "source" rather than "input": avoid shadowing the builtin input()
    source = make_input(data, input_options)
    result = {
        "url_or_filename": source.url_or_filename,
        "format": source.format,
    }

    if result["format"] in ("XLS", "XLSX",):
        # Excel metadata is special: the input object can enumerate sheets itself
        result["sheets"] = source.get_sheet_info()

    else:
        # Otherwise, compute from the content

        # iterate through the rows, tracking size and keeping an opening sample
        opening_rows = []
        nrows = 0
        ncols = 0
        for row in source:
            nrows += 1
            ncols = max(ncols, len(row))
            if nrows <= MAX_SCAN_ROWS:
                opening_rows.append(row)

        # See if the opening rows are HXLated
        try:
            reader = HXLReader(opening_rows)
            hxl_hashtag_hash = reader.columns_hash
        except HXLTagsNotFoundException:
            hxl_hashtag_hash = None

        # Non-Excel sources always present exactly one "sheet"
        result["sheets"] = [
            {
                "name": "__DEFAULT__",
                "nrows": nrows,
                "ncols": ncols,
                "is_hidden": False,
                "has_merged_cells": False,
                "is_hxlated": hxl_hashtag_hash is not None,
                "header_hash": hash_row(opening_rows[0]) if nrows > 0 else None,
                "hxl_hashtag_hash": hxl_hashtag_hash,
            },
        ]

    return result

221300
222301def tagger (data , specs , input_options = None , default_tag = None , match_all = False ):
223302 """Open an untagged data source and add hashtags.
@@ -265,6 +344,7 @@ def tagger(data, specs, input_options=None, default_tag=None, match_all=False):
265344 )
266345
267346
347+
268348def write_hxl (output , source , show_headers = True , show_tags = True ):
269349 """Serialize a HXL dataset to an output stream in CSV format.
270350
@@ -818,36 +898,12 @@ class AbstractInput(object):
818898
819899 __metaclass__ = abc .ABCMeta
820900
821- def __init__ (self , input_options ):
901+ def __init__ (self , input_options , url_or_filename = None ):
822902 super ().__init__ ()
823903 self .input_options = input_options
904+ self .url_or_filename = None
824905 self .is_repeatable = False
825906
826- def info (self ):
827- """ Get information about the raw dataset.
828- Uses low-level row-wise input, so the source doesn't have to be HXLated.
829-
830- The result will be a dict with info about the workbook:
831-
832- - format (e.g. "XLSX")
833- - sheets (list)
834-
835- The following will appear for each sheet:
836-
837- - sheet_name (string)
838- - is_hidden (boolean)
839- - nrows (int)
840- - ncols (int)
841- - has_merged_cells (boolean)
842- - is_hxlated (boolean)
843- - header_hash (MD5 string)
844- - hashtag hash (MD5 string, or null if not HXLated)
845-
846- (Currently supported only for Excel.)
847-
848- """
849- raise NotImplementedError ()
850-
851907 @abc .abstractmethod
852908 def __iter__ (self ):
853909 return self
@@ -876,20 +932,22 @@ class CSVInput(AbstractInput):
876932 _DELIMITERS = ["," , "\t " , ";" , ":" , "|" ]
877933 """ CSV delimiters allowed """
878934
879- def __init__ (self , input , input_options ):
935+ def __init__ (self , input , input_options , url_or_filename = None ):
880936 """
881937 Args:
882938 input (io.IOBase): a byte input stream
883939 input_options (InputOptions): options for reading a dataset.
884940
885941 """
886- super ().__init__ (input_options )
942+ super ().__init__ (input_options , url_or_filename )
943+
944+ self .format = "CSV"
887945
888946 # guess the delimiter
889- delimiter = CSVInput ._detect_delimiter (input , input_options .encoding or "utf-8" )
947+ self . delimiter = CSVInput ._detect_delimiter (input , input_options .encoding or "utf-8" )
890948
891949 self ._input = io .TextIOWrapper (input , encoding = input_options .encoding , errors = "replace" )
892- self ._reader = csv .reader (self ._input , delimiter = delimiter )
950+ self ._reader = csv .reader (self ._input , delimiter = self . delimiter )
893951
894952 def __exit__ (self , value , type , traceback ):
895953 self ._input .close ()
@@ -964,16 +1022,17 @@ class JSONInput(AbstractInput):
9641022
9651023 """
9661024
967- def __init__ (self , input , input_options ):
1025+ def __init__ (self , input , input_options , url_or_filename = None ):
9681026 """
9691027 Args:
9701028 input (io.IOBase): an input byte stream
9711029 input_options (InputOptions): options for reading a dataset.
9721030
9731031 """
974- super ().__init__ (input_options )
1032+ super ().__init__ (input_options , url_or_filename )
9751033
9761034 # values to be set by _scan_data_element
1035+ self .format = 'JSON'
9771036 self .type = None
9781037 self .headers = []
9791038 self .show_headers = False
@@ -1122,8 +1181,7 @@ def __init__(self, contents, input_options, url_or_filename=None):
11221181 input_options (InputOptions): options for reading a dataset.
11231182 url_or_filename (string): the original URL or filename or None
11241183 """
1125- super ().__init__ (input_options )
1126- self .url_or_filename = url_or_filename
1184+ super ().__init__ (input_options , url_or_filename )
11271185 self .is_repeatable = True
11281186 self .contents = contents
11291187
@@ -1134,24 +1192,16 @@ def __init__(self, contents, input_options, url_or_filename=None):
11341192 sheet_index = self ._find_hxl_sheet_index ()
11351193
11361194 self ._sheet = self ._get_sheet (sheet_index )
1195+
1196+ self .format = "XLSX" if self ._workbook .biff_version == 0 else "XLS"
1197+
11371198 self .merged_values = {}
11381199
1139- def info (self ):
1140- """ See method doc for parent class """
1200+ def get_sheet_info (self ):
1201+ """ Return sheet metadata for the top-level info() function """
1202+
1203+ result = [] # list of dicts containing info for each sheet in the workbook
11411204
1142- def hash_headers (raw_row ):
1143- """ Create a hash just for the first row of values
1144- """
1145- md5 = hashlib .md5 ()
1146- for value in raw_row :
1147- md5 .update (hxl .datatypes .normalise_space (value ).encode ('utf-8' ))
1148- return md5 .hexdigest ()
1149-
1150- result = {
1151- "url_or_filename" : self .url_or_filename ,
1152- "format" : "XLSX" if self ._workbook .biff_version == 0 else "XLS" ,
1153- "sheets" : [],
1154- }
11551205 for sheet_index in range (0 , self ._workbook .nsheets ):
11561206 sheet = self ._get_sheet (sheet_index )
11571207 columns = self ._get_columns (sheet )
@@ -1162,10 +1212,10 @@ def hash_headers (raw_row):
11621212 "ncols" : sheet .ncols ,
11631213 "has_merged_cells" : (len (sheet .merged_cells ) > 0 ),
11641214 "is_hxlated" : (columns is not None ),
1165- "header_hash" : hash_headers (self ._get_row (sheet , 0 )) if sheet .nrows > 0 else None ,
1166- "hashtag_hash " : hxl .model .Column .hash_list (columns ) if columns else None ,
1215+ "header_hash" : hash_row (self ._get_row (sheet , 0 )) if sheet .nrows > 0 else None ,
1216+ "hxl_hashtag_hash " : hxl .model .Column .hash_list (columns ) if columns else None ,
11671217 }
1168- result [ "sheets" ] .append (sheet_info )
1218+ result .append (sheet_info )
11691219 return result
11701220
11711221 def __iter__ (self ):
@@ -1314,6 +1364,7 @@ def __init__(self, data):
13141364
13151365 """
13161366 super ().__init__ (input_options = None )
1367+ self .format = 'Arrays'
13171368 self .data = data
13181369 self .is_repeatable = True
13191370
@@ -1834,4 +1885,13 @@ def _get_kobo_url(asset_id, url, input_options, max_export_age_seconds=14400):
18341885 time .sleep (2 )
18351886
18361887
def hash_row(row):
    """Compute an MD5 hex digest for a row of values.

    Each value is whitespace-normalised before hashing, so rows that differ
    only in spacing produce the same digest.

    Args:
        row: an iterable of cell values.

    Returns:
        The MD5 hex digest (string) of the normalised values.
    """
    digest = hashlib.md5()
    for value in row:
        normalised = hxl.datatypes.normalise_space(value)
        digest.update(normalised.encode('utf-8'))
    return digest.hexdigest()

1896+
18371897# end
0 commit comments