 import duckdb

 from digital_land.package.organisation import OrganisationPackage
-from digital_land.check import duplicate_reference_check
 from digital_land.specification import Specification
 from digital_land.collect import Collector
 from digital_land.collection import Collection, resource_path

 from digital_land.package.dataset import DatasetPackage
 from digital_land.package.dataset_parquet import DatasetParquetPackage
-from digital_land.phase.combine import FactCombinePhase
 from digital_land.phase.concat import ConcatFieldPhase
 from digital_land.phase.convert import ConvertPhase, execute
 from digital_land.phase.default import DefaultPhase
 from digital_land.phase.dump import DumpPhase
-from digital_land.phase.factor import FactorPhase
 from digital_land.phase.filter import FilterPhase
 from digital_land.phase.harmonise import HarmonisePhase
 from digital_land.phase.lookup import (
-    EntityLookupPhase,
-    FactLookupPhase,
     PrintLookupPhase,
 )
 from digital_land.phase.map import MapPhase
 from digital_land.phase.migrate import MigratePhase
 from digital_land.phase.normalise import NormalisePhase
 from digital_land.phase.organisation import OrganisationPhase
 from digital_land.phase.parse import ParsePhase
 from digital_land.phase.patch import PatchPhase
-from digital_land.phase.priority import PriorityPhase
-from digital_land.phase.pivot import PivotPhase
 from digital_land.phase.prefix import EntityPrefixPhase
-from digital_land.phase.prune import FieldPrunePhase, EntityPrunePhase, FactPrunePhase
-from digital_land.phase.reference import EntityReferencePhase, FactReferencePhase
-from digital_land.phase.save import SavePhase
+from digital_land.phase.prune import FieldPrunePhase
+from digital_land.phase.reference import EntityReferencePhase
 from digital_land.pipeline import run_pipeline, Lookups, Pipeline
 from digital_land.pipeline.process import convert_tranformed_csv_to_pq
 from digital_land.schema import Schema
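
Most of the imports dropped above (EntityLookupPhase, FactLookupPhase, FactCombinePhase, FactorPhase, PriorityPhase, PivotPhase, SavePhase and the extra prune/reference phases) are exactly the phases this change moves out of pipeline_run and behind Pipeline.transform, so they are no longer composed at this call site. For orientation, a minimal sketch of the phase-chaining pattern that run_pipeline implements; UppercasePhase and run_pipeline_sketch below are illustrative stand-ins, not the digital_land implementations:

    from typing import Iterable, Iterator

    class UppercasePhase:
        # Illustrative phase: consumes a row stream and yields a transformed
        # stream, the same shape as ParsePhase, HarmonisePhase, etc.
        def process(self, stream: Iterable[dict]) -> Iterator[dict]:
            for row in stream:
                yield {k: str(v).upper() for k, v in row.items()}

    def run_pipeline_sketch(*phases, stream):
        # Fold each phase over the output of the previous one.
        for phase in phases:
            stream = phase.process(stream)
        return list(stream)

    rows = run_pipeline_sketch(UppercasePhase(), stream=[{"reference": "abc"}])
    assert rows == [{"REFERENCE": "ABC"}]
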
@@ -220,7 +212,6 @@ def pipeline_run(
     input_path,
     output_path: Path,
     collection_dir,  # TBD: remove, replaced by endpoints, organisations and entry_date
-    null_path=None,  # TBD: remove this
     issue_dir=None,
     operational_issue_dir="performance/operational_issue/",
     organisation_path=None,
@@ -244,37 +235,18 @@ def pipeline_run(
     if resource is None:
         resource = resource_from_path(input_path)
     dataset = dataset
-    schema = specification.pipeline[pipeline.name]["schema"]
-    intermediate_fieldnames = specification.intermediate_fieldnames(pipeline)
-    issue_log = IssueLog(dataset=dataset, resource=resource)
-    operational_issue_log = OperationalIssueLog(dataset=dataset, resource=resource)
-    column_field_log = ColumnFieldLog(dataset=dataset, resource=resource)
-    dataset_resource_log = DatasetResourceLog(dataset=dataset, resource=resource)
-    converted_resource_log = ConvertedResourceLog(dataset=dataset, resource=resource)
-    api = API(specification=specification)
-    entity_range_min = specification.get_dataset_entity_min(dataset)
-    entity_range_max = specification.get_dataset_entity_max(dataset)
 
-    # load pipeline configuration
-    skip_patterns = pipeline.skip_patterns(resource, endpoints)
-    columns = pipeline.columns(resource, endpoints=endpoints)
-    concats = pipeline.concatenations(resource, endpoints=endpoints)
-    patches = pipeline.patches(resource=resource, endpoints=endpoints)
-    lookups = pipeline.lookups(resource=resource)
-    default_fields = pipeline.default_fields(resource=resource, endpoints=endpoints)
-    default_values = pipeline.default_values(endpoints=endpoints)
-    combine_fields = pipeline.combine_fields(endpoints=endpoints)
-    redirect_lookups = pipeline.redirect_lookups()
-
-    # load config db
-    # TODO get more information from the config
+    # load config db and pass to Pipeline. TODO: move to the Pipeline class and use more widely
     # TODO in future we need better way of making specification optional for config
     if Path(config_path).exists():
         config = Config(path=config_path, specification=specification)
     else:
-        logging.error("Config path does not exist")
+        logging.error("Config path does not exist")
         config = None
 
+    pipeline.config = config
+    pipeline.specification = specification
+
     # load organisations
     organisation = Organisation(
         organisation_path=organisation_path, pipeline_dir=Path(pipeline.path)
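
Note that config may legitimately be None here (the config db is optional), and the new pipeline.config / pipeline.specification assignments above hand both to the Pipeline before transform runs. A hypothetical sketch of the None-safe access pattern this implies inside the Pipeline class; get_rules and the dict-style config below are invented for illustration, not digital_land API:

    class PipelineSketch:
        def __init__(self, config=None, specification=None):
            self.config = config  # may be None when no config db exists
            self.specification = specification

        def get_rules(self, dataset):
            # Invented accessor: fall back to an empty ruleset rather than
            # raising when the config db was never loaded.
            if self.config is None:
                return []
            return self.config.get(dataset, [])

    assert PipelineSketch().get_rules("tree-preservation-zone") == []
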
@@ -288,116 +260,40 @@ def pipeline_run(
     organisations = collection.resource_organisations(resource)
     entry_date = collection.resource_start_date(resource)
 
-    # Load valid category values
+    api = API(specification=specification)
     valid_category_values = api.get_valid_category_values(dataset, pipeline)
 
-    # resource specific default values
-    if len(organisations) == 1:
-        default_values["organisation"] = organisations[0]
-
-    # need an entry-date for all entries and for facts
-    # if a default entry-date isn't set through config then use the entry-date passed
-    # to this function
-    if entry_date:
-        if "entry-date" not in default_values:
-            default_values["entry-date"] = entry_date
-
-    # TODO migrate all of this into a method on the Pipeline class
-    run_pipeline(
-        ConvertPhase(
-            path=input_path,
-            dataset_resource_log=dataset_resource_log,
-            converted_resource_log=converted_resource_log,
-            output_path=converted_path,
-        ),
-        NormalisePhase(skip_patterns=skip_patterns),
-        ParsePhase(),
-        ConcatFieldPhase(concats=concats, log=column_field_log),
-        FilterPhase(filters=pipeline.filters(resource)),
-        MapPhase(
-            fieldnames=intermediate_fieldnames,
-            columns=columns,
-            log=column_field_log,
-        ),
-        FilterPhase(filters=pipeline.filters(resource, endpoints=endpoints)),
-        PatchPhase(
-            issues=issue_log,
-            patches=patches,
-        ),
-        HarmonisePhase(
-            field_datatype_map=specification.get_field_datatype_map(),
-            issues=issue_log,
-            dataset=dataset,
-            valid_category_values=valid_category_values,
-        ),
-        DefaultPhase(
-            default_fields=default_fields,
-            default_values=default_values,
-            issues=issue_log,
-        ),
-        # TBD: move migrating columns to fields to be immediately after map
-        # this will simplify harmonisation and remove intermediate_fieldnames
-        # but affects brownfield-land and other pipelines which operate on columns
-        MigratePhase(
-            fields=specification.schema_field[schema],
-            migrations=pipeline.migrations(),
-        ),
-        OrganisationPhase(organisation=organisation, issues=issue_log),
-        FieldPrunePhase(fields=specification.current_fieldnames(schema)),
-        EntityReferencePhase(
-            dataset=dataset,
-            prefix=specification.dataset_prefix(dataset),
-            issues=issue_log,
-        ),
-        EntityPrefixPhase(dataset=dataset),
-        EntityLookupPhase(
-            lookups=lookups,
-            redirect_lookups=redirect_lookups,
-            issue_log=issue_log,
-            operational_issue_log=operational_issue_log,
-            entity_range=[entity_range_min, entity_range_max],
-        ),
-        SavePhase(
-            default_output_path("harmonised", input_path),
-            fieldnames=intermediate_fieldnames,
-            enabled=save_harmonised,
-        ),
-        EntityPrunePhase(dataset_resource_log=dataset_resource_log),
-        PriorityPhase(config=config, providers=organisations),
-        PivotPhase(),
-        FactCombinePhase(issue_log=issue_log, fields=combine_fields),
-        FactorPhase(),
-        FactReferencePhase(
-            field_typology_map=specification.get_field_typology_map(),
-            field_prefix_map=specification.get_field_prefix_map(),
-        ),
-        FactLookupPhase(
-            lookups=lookups,
-            redirect_lookups=redirect_lookups,
-            issue_log=issue_log,
-            odp_collections=specification.get_odp_collections(),
-        ),
-        FactPrunePhase(),
-        SavePhase(
-            output_path,
-            fieldnames=specification.factor_fieldnames(),
-        ),
+    # Transform the resource
+    issue_log = pipeline.transform(
+        input_path=input_path,
+        output_path=output_path,
+        organisation=organisation,
+        endpoints=endpoints,
+        organisations=organisations,
+        entry_date=entry_date,
+        resource=resource,
+        converted_path=converted_path,
+        harmonised_output_path=default_output_path("harmonised", input_path),
+        save_harmonised=save_harmonised,
+        valid_category_values=valid_category_values,
     )
 
-    # In the FactCombinePhase, when combine_fields has values, duplicates are checked for and combined.
-    # If that has happened, skip duplicate_reference_check: a duplicate check has already been carried
-    # out, and this stops spurious issues about reference values not being unique.
-    if combine_fields == {}:
-        issue_log = duplicate_reference_check(issues=issue_log, csv_path=output_path)
+    # Save logs via the pipeline
+    pipeline.save_logs(
+        issue_path=os.path.join(issue_dir, resource + ".csv"),
+        operational_issue_path=os.path.join(operational_issue_dir, resource + ".csv"),
+        column_field_path=os.path.join(column_field_dir, resource + ".csv"),
+        dataset_resource_path=os.path.join(dataset_resource_dir, resource + ".csv"),
+        converted_resource_path=os.path.join(converted_resource_dir, resource + ".csv"),
+    )
 
-    issue_log.apply_entity_map()
-    issue_log.save(os.path.join(issue_dir, resource + ".csv"))
+    # Save the issue log separately as parquet
     issue_log.save_parquet(os.path.join(output_log_dir, "issue/"))
-    operational_issue_log.save(output_dir=operational_issue_dir)
-    if column_field_dir:
-        column_field_log.save(os.path.join(column_field_dir, resource + ".csv"))
-    dataset_resource_log.save(os.path.join(dataset_resource_dir, resource + ".csv"))
-    converted_resource_log.save(os.path.join(converted_resource_dir, resource + ".csv"))
+
+    # create converted parquet in the var directory
+    cache_dir = Path(organisation_path).parent
+    transformed_parquet_dir = cache_dir / "transformed_parquet" / dataset
+
     # create converted parquet in the var directory
     # TODO test without the output_path conversion above to make sure we have a test that would've failed
     transformed_parquet_dir = output_path.parent
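
The remainder of the function is cut off here, but given the convert_tranformed_csv_to_pq import at the top, the transformed CSV is presumably converted into parquet under transformed_parquet_dir next. As a rough illustration of that conversion step only (the helper's real signature is not shown in this diff; csv_to_parquet_sketch is an invented name), a pyarrow-based sketch:

    from pathlib import Path

    import pyarrow.csv as pa_csv
    import pyarrow.parquet as pa_pq

    def csv_to_parquet_sketch(csv_path: Path, parquet_path: Path) -> None:
        # Read the transformed CSV into an Arrow table and rewrite it as
        # parquet; convert_tranformed_csv_to_pq likely does something similar.
        parquet_path.parent.mkdir(parents=True, exist_ok=True)
        table = pa_csv.read_csv(str(csv_path))
        pa_pq.write_table(table, str(parquet_path))

    # e.g. csv_to_parquet_sketch(output_path, transformed_parquet_dir / f"{resource}.parquet")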