CommittedTeam
diff --git a/‎.gitignore‎
Lines changed: 5 additions & 1 deletion b/‎.gitignore‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎README.md‎
Lines changed: 100 additions & 1 deletion b/‎README.md‎
Lines changed: 100 additions & 1 deletion
diff --git a/‎commitcanvas_models/__init__.py‎
Lines changed: 1 addition & 0 deletions b/‎commitcanvas_models/__init__.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎commitcanvas_models/commitcanvas.py‎
Lines changed: 117 additions & 0 deletions b/‎commitcanvas_models/commitcanvas.py‎
Lines changed: 117 additions & 0 deletions
diff --git a/‎commitcanvas_models/data_experiments/classification_reports/60_40.csv‎
Lines changed: 55 additions & 0 deletions b/‎commitcanvas_models/data_experiments/classification_reports/60_40.csv‎
Lines changed: 55 additions & 0 deletions
@@ -6,11 +6,15 @@ __pycache__/
 # C extensions
 *.so
 
+# model and data
+angular_data.ftr
+model/
+
 # Distribution / packaging
 .Python
 build/
-develop-eggs/
 dist/
+develop-eggs/
 downloads/
 eggs/
 .eggs/
 
@@ -1 +1,100 @@
-# commitcanvas-models
+# commitcanvas-models
+
+## Set up the environment
+
+- clone the repository.
+- Download the data `angular_data.ftr` from releases. Place the file inside `commitcanvas_models/data`
+- Install [poetry](https://python-poetry.org/) or ensure that you have all the dependencies listed in `pyproject.toml` installed
+- Install the dependencies `poetry install`
+- Enter poetry shell `poetry shell`
+
+## Commands for experiments
+
+### Train the model
+
+- `commitcanvas experiment <mode> <save_report> --split <split>`
+
+   - `mode` can be either `project` or `cross_project`. If `project` is selected then each repository listed in `data/training_data/training_repo.csv` will be split into train and test sets for experimentation. If `cross_project` is selected then each repository listed in `data/training_data/training_repo.csv` will be cross-project validated.
+
+   - `save_report` path to save the report
+
+   - `split` option takes the ratio. This ratio will be used as the size of the test set for the `project` mode. By default the value of `split` is 0.25. This means that if one project has 800 commits then the top (chronologically newest) 200 will be set aside for testing, and the remaining 600 will be used for training.
+
+sample usage: `commitcanvas experiment project data_experiments/raw_predictions/project/90_10.csv --split 0.10`
+
+### Create classification reports
+
+`commitcanvas report <data_path>`
+
+input: `data_path` path to the raw data that has true and predicted labels. The data is located in `data_experiments/raw_predictions`
+
+output: the generated file will include weighted precision, recall, f-1, size of test set, size of train set and the size of total set for each label in each project. The file will be saved in `data_experiments/classification_reports`. The file name will be the same as the input file name.
+
+sample usage:
+
+Report for Project-specific with 80/20 train test split
+
+`commitcanvas report 80_20.csv`
+
+Report for project agnostic
+
+`commitcanvas report project_agnostic.csv`
+
+
+### Run statistical tests
+
+`commitcanvas mwu <path1> <path2>`
+
+Input: Paths to the classification report files.
+
+Output: Mann-Whitney U Test and Vargha and Delaney effect size for precision, recall and f-1 scores
+
+sample usage:
+
+Project-specific split 75/25 vs Project-specific split 80/20
+`commitcanvas mwu 75_25.csv 80_20.csv`
+
+Project-agnostic vs Project-specific 80/20
+`commitcanvas mwu project_agnostic.csv 80_20.csv`
+
+### Generate plots
+
+- `commitcanvas plots <report_path> <save> --title <title>`
+
+Input:
+
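+  - `report_path` name of the file with the specific split configuration (e.g. `80_20.csv`); it is read from both `data_experiments/raw_predictions` and `data_experiments/classification_reports`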
+  - `save` name of the file where the plot will be saved
+  - `title` title of the plot
+
+Output:
+
+confusion matrix and the boxplot will be saved in `data_experiments/plots`
+
+Boxplot stats such as median, mean, whishi and whislo, along with the names of the projects near those values.
+
+sample usage:
+
+`commitcanvas plots 80_20.csv project_specific_80_20.pdf --title "Project-Specific (80/20)"`
+
+
+## Data Overview
+
+### Project metadata
+
+List of repositories that follow angular's conventional commit guidelines.
+The files also include other metadata such as dominant programming language, criticality score, etc.
+
+[repositories for deployed model](data_experiments/projects_metadata/angualr_repos.csv)
+
+[repositories for experimentation](data_experiments/projects_metadata/training_repos.csv)
+
+### Data stats
+
+The projects for the deployed model have a criticality score higher than 0.60. The projects have commits that follow the conventional commit standard.
+- total number of projects for deployed model: 304
+- total number of commits for training the deployed model: 515643
+
+The projects for experimentation have a criticality score higher than 0.60 and have at least 50 commits per label: `chore`, `docs`, `feat`, `fix`, `refactor`, `test`.
+- total number of projects for experimentation: 54
+- total number of commits used in experimentation: 213192
@@ -0,0 +1 @@
+__version__ = '0.1.0'
@@ -0,0 +1,117 @@
+"""Training and evaluating the model."""
+import typer
+from commitcanvas_models.train_model import model as md
+from commitcanvas_models.data_handling import helpers
+from commitcanvas_models.data_handling import statistics
+import pandas as pd
+import pingouin as pg
+import joblib
+
+app = typer.Typer()
+
+@app.callback()
+def callback():
+    """
+    please see the documentation regarding acceptable command line options
+    """
+    # Intentionally has no body: the callback is registered only so the
+    # docstring above is attached to the Typer app (Typer presumably shows it
+    # as the top-level help text; confirm against the Typer docs).
+
+@app.command()
+def select(labels: str = "fix,feat,chore,docs,refactor,test", min_label: int = 50, subset: bool = False, subset_size: float = 0.50):
+    """Select and save repositories for training"""
+    data = pd.read_feather("data/angular_data.ftr")
+    # select repositories that use the given labels and have at least given amount of commits per label
+    filtered = helpers.filter_projects_by_label(data,labels,min_label)
+    if subset:
+        selected_set = helpers.select_projects_subset(filtered,subset_size)
+    else:
+        selected_set = helpers.map_language_to_name(filtered)
+
+    # drop languages that have only one project
+    filtered = helpers.drop_by_language(selected_set,1).reset_index(drop=True)
+
+    # save the repository names for training
+    filtered.to_csv("data_experiments/projects_metadata/training_repos.csv")
+
+
+    print("\nSelected labels: ", labels)
+    print("Minimum amount of commits required per label: ",min_label)
+    if subset:
+        print("ratio of subset: ", subset_size)
+    print("\nTotal number of subset repositories",len(filtered.name))
+    print(filtered.name)
+    print("\nCount of programming languages")
+    print(filtered.language.value_counts())
+
+@app.command()
+def train(data:str, save:str, types:str = "chore,docs,feat,fix,refactor,test"):
+    '''
+    Train the pipeline for deployment
+    '''
+    collected_data = pd.read_feather(data)
+    print(collected_data)
+    processed_data = md.data_prep(collected_data, types)
+    print(processed_data)
+    train_features,train_labels = md.feature_label_split(processed_data)
+
+    pipeline = md.build_pipline()
+    pipeline = pipeline.fit(train_features, train_labels)
+
+    print("saving the model")
+    joblib.dump(pipeline, "{}/trained_model.pkl".format(save))
+    print("saving model complete")
+
+    return pipeline
+
+@app.command()
+def experiment(mode: str,  save_report: str, split: float = 0.25):
+    
+    valid_modes = ['project','cross_project']
+    if mode not in valid_modes:
+        typer.echo("\nInvalid mode: {}. Valid modes are <project> and <cross_project. Please see the documentation for more details\n".format(mode))
+        raise typer.Exit()
+
+    filtered_data = md.select_training_data()
+    md.report(filtered_data,mode,split,save_report)
+
+@app.command()
+def report(data_path):
+
+    data = pd.read_csv("data_experiments/raw_predictions/{}".format(data_path))
+    projects = data.name.unique()
+
+    # collect classification report for each project
+    report = []
+    for project in projects:
+
+        project_data = data[data["name"]==project]
+        report.append(statistics.commitcanvas_classification_report(project_data,project))
+      
+    reports = pd.DataFrame(report)
+    combined = reports.merge(statistics.get_training_set_count(data))
+    print(combined)
+    # save the report
+    combined.to_csv("data_experiments/classification_reports/{}".format(data_path))
+
+@app.command()
+def plots(report_path, save, title:str = None):
+    data_confusion_matrix = pd.read_csv("data_experiments/raw_predictions/{}".format(report_path),index_col=0)
+    data_boxplot = pd.read_csv("data_experiments/classification_reports/{}".format(report_path),index_col=0)
+    # box and whisker plots
+    statistics.boxplot(data_boxplot,save,title = title)
+    # plot the confusion matrix
+    statistics.plot_confusion_matrix(data_confusion_matrix,save,title = title)
+
+@app.command()
+def mwu(path1: str, path2: str):
+    scores = ['precision','recall','fscore']
+    data1 = pd.read_csv("data_experiments/classification_reports/{}".format(path1))
+    data2 = pd.read_csv("data_experiments/classification_reports/{}".format(path2))
+    for score in scores:
+        mwu_results = pg.mwu(data1[score], data2[score], tail='two-sided')
+        print("\n Result for {}".format(score))
+        print(mwu_results)
+
+
+
+    
+
@@ -0,0 +1,55 @@
+,name,precision,recall,fscore,chore_total,docs_total,feat_total,fix_total,refactor_total,test_total,chore_train,docs_train,feat_train,fix_train,refactor_train,test_train,chore_test,docs_test,feat_test,fix_test,refactor_test,test_test
+0,sequelize,0.6624837512940401,0.653448275862069,0.6362581582405799,156,335,184,603,114,59,139,152,133,361,55,31,17,183,51,242,59,28
+1,ionic-framework,0.7116383238141103,0.6850899742930592,0.6861692421972126,1759,1300,716,2820,665,520,1142,895,479,1310,510,332,617,405,237,1510,155,188
+2,element3,0.529872412851264,0.5445920303605313,0.5232537700578956,387,195,57,337,244,98,234,151,39,178,123,66,153,44,18,159,121,32
+3,vite,0.6533158032847588,0.6435374149659864,0.6267316840381845,381,184,300,757,152,63,263,76,206,409,99,49,118,108,94,348,53,14
+4,vee-validate,0.7495467164154344,0.7384615384615385,0.7334536711306898,449,446,273,263,123,71,218,248,183,178,94,54,231,198,90,85,29,17
+5,stencil,0.5944581452967596,0.6075238629983155,0.5787022503946988,1165,123,590,1771,322,481,756,65,328,948,253,321,409,58,262,823,69,160
+6,pgjdbc,0.748357264513432,0.7178423236514523,0.6994033290422766,108,73,53,224,60,85,53,34,38,130,48,59,55,39,15,94,12,26
+7,ant-design,0.7601453720548141,0.7608257804632427,0.7455864288952277,635,1603,591,1653,238,246,361,957,359,1032,125,146,274,646,232,621,113,100
+8,rest.js,0.6735560237802094,0.53125,0.5710803447341212,100,238,121,206,76,219,99,93,64,128,47,145,1,145,57,78,29,74
+9,gatsby,0.6839417225971323,0.6657681940700808,0.6690916730596588,3898,869,1090,2307,104,80,2240,659,760,1255,49,46,1658,210,330,1052,55,34
+10,instantsearch.js,0.525481362130494,0.5393494228751312,0.5255961829574801,804,380,433,606,65,95,527,262,216,334,23,68,277,118,217,272,42,27
+11,commitlint,0.635885684096244,0.6695906432748538,0.617997660199331,304,183,81,180,54,53,205,112,52,103,20,21,99,71,29,77,34,32
+12,aks-engine,0.6923122171062928,0.6967963386727689,0.6915727727671531,604,185,456,542,84,315,327,121,269,358,45,192,277,64,187,184,39,123
+13,angular,0.7231729520317296,0.6898098036183702,0.675295123727446,1098,3766,2394,5065,2698,1146,1097,1867,1794,3071,1423,448,1,1899,600,1994,1275,698
+14,angular.js,0.7770211201648595,0.7782397782397782,0.7511219767866368,1238,2938,587,1779,385,288,751,1742,391,1101,209,135,487,1196,196,678,176,153
+15,vue-test-utils,0.7310814231144758,0.6906474820143885,0.6962035843724118,117,418,121,238,78,70,39,254,80,137,60,55,78,164,41,101,18,15
+16,material-components-web,0.5616105444710685,0.5574886535552194,0.4974851190489377,692,649,690,912,212,150,490,503,357,543,70,20,202,146,333,369,142,130
+17,zeebe,0.7147011176494454,0.7673054360578804,0.7160815958045137,4241,236,714,745,85,371,2292,114,566,515,40,308,1949,122,148,230,45,63
+18,RSSHub,0.8266500447980617,0.8158347676419966,0.8146301562109642,94,376,1398,907,50,79,49,307,754,515,45,72,45,69,644,392,5,7
+19,loopback-next,0.8065018724073671,0.8033707865168539,0.798144350207166,2540,513,831,764,144,104,1306,364,548,532,122,66,1234,149,283,232,22,38
+20,renovate,0.6291328473637532,0.6294287780187997,0.6006049375507068,1034,444,1215,2982,1113,128,587,223,684,1939,673,44,447,221,531,1043,440,84
+21,angular-cli,0.7019689316835042,0.6204732013520039,0.6106212543780302,359,498,906,2133,773,508,359,375,632,1240,285,215,0,123,274,893,488,293
+22,deno,0.6728211159101437,0.6413881748071979,0.6179048941362046,187,218,412,722,324,82,74,117,281,435,204,56,113,101,131,287,120,26
+23,ng-bootstrap,0.640997178793396,0.5062695924764891,0.4504725226004722,439,176,309,410,143,119,313,86,199,227,99,34,126,90,110,183,44,85
+24,serverless,0.7583243093112527,0.758800521512386,0.7563494560216293,331,249,252,428,433,224,223,189,159,250,185,144,108,60,93,178,248,80
+25,vuetify,0.8064986338281865,0.7972027972027972,0.7963738501749174,837,2556,523,2177,501,199,502,1060,399,1494,442,179,335,1496,124,683,59,20
+26,camunda-bpm-platform,0.6615077171755027,0.6573821989528795,0.6549274032976633,5180,58,2201,3471,294,734,2708,34,1351,2282,284,504,2472,24,850,1189,10,230
+27,node-mongodb-native,0.6306145867966505,0.6199004975124378,0.6114557752179993,306,191,223,521,585,686,157,94,158,286,362,450,149,97,65,235,223,236
+28,components,0.742758832760195,0.7128598848368523,0.6805673515562437,1125,761,943,3001,537,146,898,427,546,1763,223,51,227,334,397,1238,314,95
+29,js-ipfs,0.7750979421677167,0.7215033887861984,0.7293778117137204,2192,403,461,702,107,192,1247,222,329,444,46,146,945,181,132,258,61,46
+30,goreleaser,0.7065303458385616,0.7115716753022453,0.6974560313158058,194,318,275,473,76,111,116,170,139,282,63,98,78,148,136,191,13,13
+31,pnpm,0.6783456346097614,0.6116974494283202,0.6262141436509687,2099,249,747,1691,545,353,1345,168,515,862,350,170,754,81,232,829,195,183
+32,vue-cli,0.7386901295781246,0.7263257575757576,0.7169938516289083,553,381,478,950,135,143,279,191,332,591,96,95,274,190,146,359,39,48
+33,axe-core,0.7271921848761431,0.6996197718631179,0.6692666716251905,318,119,225,527,59,66,208,70,140,296,17,57,110,49,85,231,42,9
+34,verdaccio,0.5712018098053099,0.5037481259370314,0.4953960198707776,690,131,132,318,312,84,320,120,50,207,245,58,370,11,82,111,67,26
+35,webiny-js,0.6529568633414448,0.6594134342478714,0.6235719722365354,1520,60,1460,3906,510,471,1023,58,772,2354,272,277,497,2,688,1552,238,194
+36,vant,0.7916668942276838,0.7479508196721312,0.7371093980409599,905,839,904,763,105,145,440,452,712,507,62,24,465,387,192,256,43,121
+37,chakra-ui,0.6191141560602956,0.5081658291457286,0.4638281151975006,1745,742,344,764,253,132,1215,372,140,378,182,101,530,370,204,386,71,31
+38,G2,0.6462621531935653,0.6136919315403423,0.5875578859883093,461,94,494,743,143,110,259,18,298,492,93,67,202,76,196,251,50,43
+39,rxjs,0.6298569076080791,0.6051587301587301,0.6040690788418873,852,667,405,671,675,509,507,324,330,393,363,350,345,343,75,278,312,159
+40,super-productivity,0.669648252505717,0.702493551160791,0.6758808618121993,391,99,3393,1206,626,101,291,35,2087,727,341,9,100,64,1306,479,285,92
+41,electron-builder,0.6399384145815513,0.6524547803617571,0.624618244568085,151,173,567,872,98,74,57,123,378,472,75,56,94,50,189,400,23,18
+42,serenity-js,0.6269087388389237,0.6238738738738738,0.6037743758505008,306,217,175,274,56,83,191,127,123,149,41,36,115,90,52,125,15,47
+43,reaction,0.5852784652196664,0.5562798783137766,0.5554178433675837,811,180,1299,1399,1590,473,410,104,851,1020,853,213,401,76,448,379,737,260
+44,stryker,0.5131043622532878,0.5099778270509978,0.46874511538071145,228,102,330,261,91,116,155,40,199,171,56,56,73,62,131,90,35,60
+45,lerna,0.6305658107163694,0.5948275862068966,0.5674107392386014,441,86,265,356,177,125,284,46,123,229,120,68,157,40,142,127,57,57
+46,influxdb,0.6421667716802465,0.6090909090909091,0.5793094623255267,1028,81,1543,2028,361,184,498,60,923,1307,222,125,530,21,620,721,139,59
+47,superset,0.47816699200959656,0.5481569560047562,0.4749609285234071,358,141,482,881,134,106,202,110,300,554,80,15,156,31,182,327,54,91
+48,Semantic-UI-React,0.7421787023363412,0.7144866385372715,0.7171400390584501,313,550,252,529,66,68,189,298,163,298,62,57,124,252,89,231,4,11
+49,karma,0.6557938329162107,0.5908346972176759,0.5870768058951481,554,177,207,381,128,81,373,126,127,180,44,67,181,51,80,201,84,14
+50,fxa,0.624137503796885,0.637400228050171,0.6062970993237998,1633,51,1968,4358,710,51,938,21,1123,2701,464,16,695,30,845,1657,246,35
+51,jina,0.6083077496352457,0.5810097965335342,0.5815224599877651,461,465,649,971,474,298,223,297,436,631,255,149,238,168,213,340,219,149
+52,material,0.6821059935590033,0.7006802721088435,0.6633657047420202,318,489,389,1892,149,71,210,276,234,1092,115,58,108,213,155,800,34,13
+53,taro,0.644536844122211,0.6352228472522717,0.6264091358623746,997,479,1253,2611,308,129,564,362,864,1422,152,102,433,117,389,1189,156,27