2828
2929from alphapy .__main__ import main_pipeline
3030from alphapy .frame import load_frames
31+ from alphapy .frame import sequence_frame
3132from alphapy .frame import write_frame
32- from alphapy .globals import SSEP , USEP
33+ from alphapy .globals import SSEP , TAG_ID , USEP
3334from alphapy .utilities import subtract_days
3435
3536from datetime import timedelta
@@ -133,7 +134,7 @@ def __str__(self):
133134# Function run_analysis
134135#
135136
136- def run_analysis (analysis , forecast_period , leaders ,
137+ def run_analysis (analysis , lag_period , forecast_period , leaders ,
137138 predict_history , splits = True ):
138139 r"""Run an analysis for a given model and group.
139140
@@ -147,10 +148,14 @@ def run_analysis(analysis, forecast_period, leaders,
147148 ----------
148149 analysis : alphapy.Analysis
149150 The analysis to run.
151+ lag_period : int
152+ The number of lagged features for the analysis.
150153 forecast_period : int
151154 The period for forecasting the target of the analysis.
152155 leaders : list
153156 The features that are contemporaneous with the target.
157+ predict_history : int
158+ The number of periods required for lookback calculations.
154159 splits : bool, optional
155160 If ``True``, then the data for each member of the analysis
156161 group are in separate files.
@@ -185,7 +190,11 @@ def run_analysis(analysis, forecast_period, leaders,
185190 train_date = model .specs ['train_date' ]
186191
187192 # Calculate split date
193+ logger .info ("Analysis Dates" )
188194 split_date = subtract_days (predict_date , predict_history )
195+ logger .info ("Train Date: %s" , train_date )
196+ logger .info ("Split Date: %s" , split_date )
197+ logger .info ("Test Date: %s" , predict_date )
189198
190199 # Load the data frames
191200 data_frames = load_frames (group , directory , extension , separator , splits )
@@ -203,20 +212,24 @@ def run_analysis(analysis, forecast_period, leaders,
203212 # Subset each individual frame and add to the master frame
204213
205214 for df in data_frames :
215+ try :
216+ tag = df [TAG_ID ].unique ()[0 ]
217+ except :
218+ tag = 'Unknown'
219+ first_date = df .index [0 ]
206220 last_date = df .index [- 1 ]
207- # shift the target for the forecast period
208- if forecast_period > 0 :
209- df [target ] = df [target ].shift (- forecast_period )
210- # shift any leading features if necessary
211- if leaders :
212- df [leaders ] = df [leaders ].shift (- 1 )
221+ logger .info ("Analyzing %s from %s to %s" , tag , first_date , last_date )
222+ # sequence leaders, laggards, and target(s)
223+ df = sequence_frame (df , target , leaders , lag_period , forecast_period ,
224+ exclude_cols = [TAG_ID ])
213225 # get frame subsets
214226 if predict_mode :
215227 new_predict = df .loc [(df .index >= split_date ) & (df .index <= last_date )]
216228 if len (new_predict ) > 0 :
217229 predict_frame = predict_frame .append (new_predict )
218230 else :
219- logger .info ("A prediction frame has zero rows. Check prediction date." )
231+ logger .info ("Prediction frame %s has zero rows. Check prediction date." ,
232+ tag )
220233 else :
221234 # split data into train and test
222235 new_train = df .loc [(df .index >= train_date ) & (df .index < split_date )]
@@ -225,12 +238,20 @@ def run_analysis(analysis, forecast_period, leaders,
225238 train_frame = train_frame .append (new_train )
226239 new_test = df .loc [(df .index >= split_date ) & (df .index <= last_date )]
227240 if len (new_test ) > 0 :
241+ # check if target column has NaN values
242+ nan_count = df [target ].isnull ().sum ()
243+ forecast_check = forecast_period - 1
244+ if nan_count != forecast_check :
245+ logger .info ("%s has %d records with NaN targets" , tag , nan_count )
246+ # drop records with NaN values in target column
228247 new_test = new_test .dropna (subset = [target ])
248+ # append selected records to the test frame
229249 test_frame = test_frame .append (new_test )
230250 else :
231- logger .info ("A testing frame has zero rows. Check prediction date." )
251+ logger .info ("Testing frame %s has zero rows. Check prediction date." ,
252+ tag )
232253 else :
233- logger .warning ( "A training frame has zero rows. Check data source." )
254+ logger .info ( "Training frame %s has zero rows. Check data source.", tag )
234255
235256 # Write out the frames for input into the AlphaPy pipeline
236257
0 commit comments