-
Notifications
You must be signed in to change notification settings - Fork 99
Pull request for recommender system framework over GraphChi for review #2
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 41 commits
75726ce
f33479c
558482c
15a5a46
bbaad36
e71eeee
d784996
bb3565b
1f25ecc
8eb9ad9
d0ec864
9cf62fe
13f626c
ef5a105
b08b520
6ced3f3
afa50d3
e117013
db8219f
08e3b46
fe9fb8e
8fe5271
252ed2d
552723c
e9e1919
d19b350
7388ac1
a3121d9
d520eb6
2d1c58f
1efc5bf
0df95f5
0090c72
03915ab
b959e33
525d317
d8604b0
2feb4e9
65c34e6
0473413
f7cabb2
0278e5b
df28228
5834f69
6fa3d14
27a8fd3
f00f26f
4fb1070
60d2071
71deb4d
2f74ea3
b49343a
98157b8
c3a0fde
f35863c
adafd27
3a245c7
b43d968
4128c65
c57605a
10ae999
d111685
0fb5016
687aa07
e24f682
2c781dd
1fcf2bb
a0bf4c5
102b41b
61ef118
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,177 @@ | ||
| from optparse import OptionParser | ||
|
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This file can be ignored right now. It is specific to parsing a dataset |
||
| import csv | ||
| import simplejson | ||
|
|
||
| DELIM = '\t' | ||
|
|
||
| FEATURE_JSON_FORMAT = ("{\n"+ | ||
| " file_name : <file_location>\n"+ | ||
| " delim : <delimiter, default = \\t>\n"+ | ||
| " num : <num_users>\n"+ | ||
| " delete_cols : [<List of columns to not consider>]\n"+ | ||
| " multiple_feature_delim : <default = ','>\n"+ | ||
| " numerical_attr : [<list of numerical attributes>]\n" | ||
| "}") | ||
|
|
||
| MULTIPLE_FEATURE_DELIM = ',' | ||
|
|
||
|
|
||
def parse_command_line():
    """Build the command-line parser for the converter and parse sys.argv.

    Returns the (options, args) pair produced by OptionParser.parse_args().
    """
    usage = ("python convert_to_mm.py -g=<graph-file> -e=<num_edges>"
             "[other optional options]")
    parser = OptionParser(usage=usage)

    # Where the edge list lives and how big it is.
    parser.add_option("-g", "--graph-file", action="store", type="string",
                      dest="graph_file",
                      help="The file containing the graph(<inedge> <out-edge> <edge-val>")
    parser.add_option("-e", "--num_edges", action="store", type="int",
                      dest="num_edges",
                      help="Number of edges in the graph file")

    # JSON descriptors for the optional user/item feature files; both share
    # the same expected format.
    format_help = " The format of JSON is as follows: \n" + FEATURE_JSON_FORMAT
    parser.add_option("-u", "--user_file_info", action="store", type="string",
                      dest="user_file_info",
                      help="Json String containing required information about the user feature file."
                           + format_help)
    parser.add_option("-i", "--item_file_info", action="store", type="string",
                      dest="item_file_info",
                      help="Json String containing required information about the item feature file."
                           + format_help)

    return parser.parse_args()
|
|
||
|
|
||
def update_vertex_map_from_graph(graph_file_name, user_mapping, item_mapping):
    """Scan the edge file once, assigning 1-based indices to unseen vertices.

    Rows are DELIM-separated (<user> <item> <value>).  user_mapping and
    item_mapping (raw id -> matrix index) are updated in place; indices are
    1-based to match MatrixMarket coordinate conventions.

    Returns the number of edges (rows) read from the file.
    """
    next_user_id = len(user_mapping) + 1
    next_item_id = len(item_mapping) + 1
    num_edges = 0

    with open(graph_file_name, 'r') as graph_file:
        for row in csv.reader(graph_file, delimiter=DELIM):
            num_edges += 1
            if row[0] not in user_mapping:
                user_mapping[row[0]] = next_user_id
                next_user_id += 1
            if row[1] not in item_mapping:
                item_mapping[row[1]] = next_item_id
                next_item_id += 1

    return num_edges
|
|
||
def convert_to_matrix_market(graph_file_name, user_mapping, item_mapping):
    """Convert the DELIM-separated edge list to MatrixMarket coordinate format.

    A first pass (update_vertex_map_from_graph) assigns every user and item
    a 1-based matrix index, so the second pass only needs dictionary
    lookups.  Output is written next to the input as <graph_file_name>.mm.

    Returns {'num_edges': <edge count>, 'num_features': 0}.
    """
    # Pass 1: complete the vertex mappings and count the edges.
    num_edges = update_vertex_map_from_graph(graph_file_name, user_mapping,
                                             item_mapping)

    # Pass 2: emit the MatrixMarket header and one "user item value" line
    # per edge.  BUG FIX: the original re-derived ids here from the
    # undefined names uniq_vertex_count / item_count, which would have
    # raised NameError for any vertex it considered unseen; after pass 1
    # every vertex is already mapped, so plain lookups are correct.  The
    # output file is now also closed deterministically via "with".
    with open(graph_file_name, 'r') as graph_file:
        with open(graph_file_name + ".mm", 'w') as out_file:
            out_file.write("%%MatrixMarket matrix coordinate real general\n")
            # NOTE(review): "<DATE>" looks like a never-filled placeholder;
            # consider writing the real generation date.
            out_file.write("% Generated on <DATE>\n")
            out_file.write(str(len(user_mapping)) + ' '
                           + str(len(item_mapping)) + ' '
                           + str(num_edges) + '\n')

            for row in csv.reader(graph_file, delimiter=DELIM):
                out_file.write(str(user_mapping[row[0]]) + ' '
                               + str(item_mapping[row[1]]) + ' '
                               + row[2] + '\n')

    return {'num_edges': num_edges, 'num_features': 0}
|
|
||
|
|
||
def parse_vertex_features(vertex_mapping, feature_file_info_str):
    """Convert a vertex (user/item) feature file into "id label:value" lines.

    feature_file_info_str is a JSON string describing the input file (see
    FEATURE_JSON_FORMAT): its location, columns to skip, which columns are
    numerical, and the delimiter used inside multi-valued cells.
    vertex_mapping (raw id -> 1-based matrix index) is updated in place for
    vertices that appear only in the feature file.  Output is written to
    <file_name>.conv.

    Returns {'num_entries': <vertices mapped>, 'num_features': <distinct labels>}.
    """
    feature_file_info = simplejson.loads(feature_file_info_str)

    multiple_feature_delim = feature_file_info.get("multiple_feature_delim",
                                                   MULTIPLE_FEATURE_DELIM)
    # BUG FIX: these are dict *keys*, not attributes.  The original accessed
    # feature_file_info.delete_cols / .numerical_attr, which raises
    # AttributeError on the plain dict simplejson.loads returns.
    delete_cols = feature_file_info.get("delete_cols", [])
    numerical_attr = feature_file_info.get("numerical_attr", [])

    next_vertex_id = len(vertex_mapping) + 1
    feature_count = 1      # next unused feature label (labels are 1-based)
    feature_mapping = {}   # (column index, value) -> feature label

    # BUG FIX: the output file was never closed; "with" guarantees flushing.
    with open(feature_file_info["file_name"], 'r') as feature_file:
        with open(feature_file_info["file_name"] + ".conv", 'w') as out_file:
            for row in csv.reader(feature_file, delimiter=DELIM):
                # A vertex seen only here (not in the graph) still gets an index.
                if row[0] not in vertex_mapping:
                    vertex_mapping[row[0]] = next_vertex_id
                    next_vertex_id += 1

                parts = [str(vertex_mapping[row[0]])]
                for i in range(1, len(row)):
                    if i in delete_cols:
                        continue

                    if i in numerical_attr:
                        # Numerical column: one shared label per column,
                        # value emitted as-is.
                        if (i, 0) not in feature_mapping:
                            feature_mapping[(i, 0)] = feature_count
                            feature_count += 1
                        parts.append(str(feature_mapping[(i, 0)]) + ":" + row[i])
                        continue

                    # Categorical column: one label per distinct
                    # (column, value) pair; cells may hold several values.
                    for val in row[i].split(multiple_feature_delim):
                        if (i, val) not in feature_mapping:
                            feature_mapping[(i, val)] = feature_count
                            feature_count += 1
                        parts.append(str(feature_mapping[(i, val)]) + ":1")

                out_file.write(DELIM.join(parts) + '\n')

    # BUG FIX: feature_count is the *next unused* label, so the number of
    # distinct features handed out is feature_count - 1; the original
    # over-reported by one.
    return {'num_entries': len(vertex_mapping),
            'num_features': feature_count - 1}
|
|
||
|
|
||
if __name__ == "__main__":
    (options, args) = parse_command_line()

    # BUG FIX: optparse always defines the attribute (value None when the
    # flag is absent), so hasattr(options, "user_file_info") was always
    # true and a missing flag crashed inside simplejson.loads(None).
    # Test the value itself instead.
    user_mapping = {}
    users_info = {}
    if options.user_file_info is not None:
        users_info = parse_vertex_features(user_mapping, options.user_file_info)

    item_mapping = {}
    items_info = {}
    if options.item_file_info is not None:
        items_info = parse_vertex_features(item_mapping, options.item_file_info)

    # Convert the edge list; this also completes both vertex mappings.
    graph_info = convert_to_matrix_market(options.graph_file, user_mapping,
                                          item_mapping)

    # Persist the bookkeeping (sizes and id mappings) for downstream use.
    with open(options.graph_file + ".info", 'w') as f:
        f.write(
            simplejson.dumps(
                {
                    'num_users': len(user_mapping),
                    'num_user_features': users_info.get('num_features', 0),
                    'num_items': len(item_mapping),
                    'num_item_features': items_info.get('num_features', 0),
                    'num_edge_features': graph_info.get('num_features', 0),
                    'num_edges': graph_info.get('num_edges', 0),
                    'user_mapping': user_mapping,
                    'item_mapping': item_mapping
                }
            )
        )
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,80 @@ | ||
| from optparse import OptionParser | ||
|
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This file can be ignored. Specific to a dataset parsing |
||
| import csv | ||
| import simplejson | ||
|
|
||
| DELIM = '\t' | ||
|
|
||
| MULTIPLE_FEATURE_DELIM = ',' | ||
|
|
||
| month_mapping = {"Jan":1, "Feb":2, "Mar":3, "Apr":4, "May":5, "Jun":6, "Jul":7, "Aug":8, "Sep":9, "Oct":10, "Nov":11, "Dec":12} | ||
|
|
||
def parse_command_line():
    """Parse the lastFM user-feature binning options from sys.argv.

    Returns the (options, args) pair produced by OptionParser.parse_args().
    """
    parser = OptionParser(
        usage="python lastFM_user_feature.py -u=<user-file> -a=<age_bin_interval>"
              " -dy=<date_bin_year> -dm=<date_bin_month> -dd=<date_bin_day>"
              "[other optional options]")

    # Input file with the raw user features.
    parser.add_option("-u", "--user-file", action="store", type="string",
                      dest="user_file",
                      help="The file containing the user features")

    # Bin sizes; for the date bins 0 means "do not bin on this unit".
    parser.add_option("-a", "--age_bin_interval", action="store", type="int",
                      dest="age_interval", default=5,
                      help="The interval of an age bin")
    parser.add_option("-y", "--date_bin_year", action="store", type="int",
                      dest="year_interval", default=0,
                      help="The interval of date bin on year")
    parser.add_option("-m", "--date_bin_month", action="store", type="int",
                      dest="month_interval", default=0,
                      help="The interval of date bin on month")
    parser.add_option("-d", "--date_bin_day", action="store", type="int",
                      dest="day_interval", default=0,
                      help="The interval of date bin on day")

    return parser.parse_args()
|
|
||
|
|
||
def date_key_conversion(date, year_interval, month_interval, day_interval):
    """Bin a date string such as "May 1, 2009" into a coarse bucket key.

    Exactly one granularity is honoured, finest first: a non-zero
    day_interval yields "year month day_bin"; else a non-zero
    month_interval yields "year month_bin"; else a non-zero year_interval
    yields "year_bin"; otherwise every day is its own bin.

    Returns the bucket key as a string.
    """
    # Local month table so the function is self-contained (mirrors the
    # module-level month_mapping).
    month_to_num = {"Jan": 1, "Feb": 2, "Mar": 3, "Apr": 4, "May": 5,
                    "Jun": 6, "Jul": 7, "Aug": 8, "Sep": 9, "Oct": 10,
                    "Nov": 11, "Dec": 12}

    fields = date.replace(',', ' ').split()   # e.g. ["May", "1", "2009"]
    year = int(fields[2])
    month = month_to_num[fields[0]]
    day = int(fields[1])

    # BUG FIX: use floor division.  Plain "/" was Python-2 integer
    # division; under Python 3 it produces float keys like "2009 5 1.28".
    if day_interval != 0:
        return str(year) + ' ' + str(month) + ' ' + str(day // day_interval)
    if month_interval != 0:
        return str(year) + ' ' + str(month // month_interval)
    if year_interval != 0:
        return str(year // year_interval)
    # Nothing specified: each day is an independent bin.
    return str(year) + ' ' + str(month) + ' ' + str(day)
|
|
||
def age_key_conversion(age, age_interval):
    """Bin an age string into a bucket key.

    An empty age (missing data) passes through unchanged; a zero interval
    returns the age as-is; otherwise the bin index (age // interval) is
    returned as a string.
    """
    if age == '':
        return age
    if age_interval == 0:
        return age
    # BUG FIX: floor division.  Plain "/" was Python-2 integer division;
    # under Python 3 it would produce float keys such as "4.6".
    return str(int(age) // age_interval)
|
|
||
def parse_user_features(user_feature_file, age_interval, year_interval,
                        month_interval, day_interval):
    """Rewrite the raw user feature file with binned age and signup date.

    Expects DELIM-separated rows of (user_id, col1, age, col3, date);
    the age and date columns are replaced by their bin keys (see
    age_key_conversion / date_key_conversion).  Output goes to
    <user_feature_file>_age<A>_<Y>y<M>m<D>d.conv.
    """
    out_name = (user_feature_file + "_age" + str(age_interval) + "_"
                + str(year_interval) + "y" + str(month_interval) + "m"
                + str(day_interval) + "d" + ".conv")

    # BUG FIX: the output file was never closed; "with" guarantees the
    # buffers are flushed even if a row fails to parse.
    with open(user_feature_file, 'r') as feature_file:
        with open(out_name, 'w') as user_out_file:
            for row in csv.reader(feature_file, delimiter=DELIM):
                age_key = age_key_conversion(row[2], age_interval)
                date_key = date_key_conversion(row[4], year_interval,
                                               month_interval, day_interval)
                user_out_file.write(row[0] + DELIM + row[1] + DELIM + age_key
                                    + DELIM + row[3] + DELIM + date_key + '\n')
|
|
||
if __name__ == "__main__":
    # Script entry point: parse the options and run the feature conversion.
    options, _args = parse_command_line()

    graph_info = parse_user_features(
        options.user_file,
        options.age_interval,
        options.year_interval,
        options.month_interval,
        options.day_interval,
    )
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,21 @@ | ||
| import sys | ||
|
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can be ignored |
||
| import csv | ||
|
|
||
| DELIM = "|" | ||
| OUT_DELIM = '\t' | ||
|
|
||
if __name__ == "__main__":
    # Convert a "|"-delimited user file into "user_id<TAB>i,j,k" rows,
    # where i,j,k are the indices (from column 5 on) whose value is '1'.
    # Usage: python <script> <user-file>; output goes to <user-file>.processed.
    #
    # BUG FIX: per the review note, the indentation of the original was
    # broken -- the trailing-comma strip and the write must run once per
    # row, *after* the inner column loop.  The output file is now also
    # closed deterministically via "with".
    with open(sys.argv[1], 'r') as user_file:
        with open(sys.argv[1] + ".processed", 'w') as out_file:
            reader = csv.reader(user_file, delimiter=DELIM)
            for row in reader:
                out_str = row[0] + OUT_DELIM
                for i in range(5, len(row)):
                    if row[i] == '1':
                        out_str = out_str + str(i) + ","
                # Drop the trailing comma left by the loop, if any.
                if out_str[-1] == ',':
                    out_str = out_str[:-1]
                out_file.write(out_str + '\n')
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Need to fix the indentation. It is pretty screwed up here