-
Notifications
You must be signed in to change notification settings - Fork 99
Pull request for recommender system framework over GraphChi for review #2
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 41 commits
75726ce
f33479c
558482c
15a5a46
bbaad36
e71eeee
d784996
bb3565b
1f25ecc
8eb9ad9
d0ec864
9cf62fe
13f626c
ef5a105
b08b520
6ced3f3
afa50d3
e117013
db8219f
08e3b46
fe9fb8e
8fe5271
252ed2d
552723c
e9e1919
d19b350
7388ac1
a3121d9
d520eb6
2d1c58f
1efc5bf
0df95f5
0090c72
03915ab
b959e33
525d317
d8604b0
2feb4e9
65c34e6
0473413
f7cabb2
0278e5b
df28228
5834f69
6fa3d14
27a8fd3
f00f26f
4fb1070
60d2071
71deb4d
2f74ea3
b49343a
98157b8
c3a0fde
f35863c
adafd27
3a245c7
b43d968
4128c65
c57605a
10ae999
d111685
0fb5016
687aa07
e24f682
2c781dd
1fcf2bb
a0bf4c5
102b41b
61ef118
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,177 @@ | ||
| from optparse import OptionParser | ||
|
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This file can be ignored right now. It is specific to parsing a dataset |
||
| import csv | ||
| import simplejson | ||
|
|
||
| DELIM = '\t' | ||
|
|
||
| FEATURE_JSON_FORMAT = ("{\n"+ | ||
| " file_name : <file_location>\n"+ | ||
| " delim : <delimiter, default = \\t>\n"+ | ||
| " num : <num_users>\n"+ | ||
| " delete_cols : [<List of columns to not consider>]\n"+ | ||
| " multiple_feature_delim : <default = ','>\n"+ | ||
| " numerical_attr : [<list of numerical attributes>]\n" | ||
| "}") | ||
|
|
||
| MULTIPLE_FEATURE_DELIM = ',' | ||
|
|
||
|
|
||
def parse_command_line():
    """Build the command-line parser for the converter and parse sys.argv.

    Returns the (options, args) pair produced by OptionParser.parse_args().
    """
    usage = ("python convert_to_mm.py -g=<graph-file> -e=<num_edges>"
             "[other optional options]")
    parser = OptionParser(usage=usage)

    # Where the edge list lives and how big it is.
    parser.add_option("-g", "--graph-file", action="store", type="string",
                      dest="graph_file",
                      help="The file containing the graph(<inedge> <out-edge> <edge-val>")
    parser.add_option("-e", "--num_edges", action="store", type="int",
                      dest="num_edges",
                      help="Number of edges in the graph file")

    # JSON descriptors for the optional user/item feature files; both share
    # the same expected format.
    format_help = " The format of JSON is as follows: \n" + FEATURE_JSON_FORMAT
    parser.add_option("-u", "--user_file_info", action="store", type="string",
                      dest="user_file_info",
                      help="Json String containing required information about the user feature file."
                           + format_help)
    parser.add_option("-i", "--item_file_info", action="store", type="string",
                      dest="item_file_info",
                      help="Json String containing required information about the item feature file."
                           + format_help)

    return parser.parse_args()
|
|
||
|
|
||
def update_vertex_map_from_graph(graph_file_name, user_mapping, item_mapping):
    """Scan the edge file once, assigning 1-based indices to unseen vertices.

    Rows are DELIM-separated (<user> <item> <value>).  user_mapping and
    item_mapping (raw id -> matrix index) are updated in place; indices are
    1-based to match MatrixMarket coordinate conventions.

    Returns the number of edges (rows) read from the file.
    """
    next_user_id = len(user_mapping) + 1
    next_item_id = len(item_mapping) + 1
    num_edges = 0

    with open(graph_file_name, 'r') as graph_file:
        for row in csv.reader(graph_file, delimiter=DELIM):
            num_edges += 1
            if row[0] not in user_mapping:
                user_mapping[row[0]] = next_user_id
                next_user_id += 1
            if row[1] not in item_mapping:
                item_mapping[row[1]] = next_item_id
                next_item_id += 1

    return num_edges
|
|
||
def convert_to_matrix_market(graph_file_name, user_mapping, item_mapping):
    """Convert the DELIM-separated edge list to MatrixMarket coordinate format.

    A first pass (update_vertex_map_from_graph) assigns every user and item
    a 1-based matrix index, so the second pass only needs dictionary
    lookups.  Output is written next to the input as <graph_file_name>.mm.

    Returns {'num_edges': <edge count>, 'num_features': 0}.
    """
    # Pass 1: complete the vertex mappings and count the edges.
    num_edges = update_vertex_map_from_graph(graph_file_name, user_mapping,
                                             item_mapping)

    # Pass 2: emit the MatrixMarket header and one "user item value" line
    # per edge.  BUG FIX: the original re-derived ids here from the
    # undefined names uniq_vertex_count / item_count, which would have
    # raised NameError for any vertex it considered unseen; after pass 1
    # every vertex is already mapped, so plain lookups are correct.  The
    # output file is now also closed deterministically via "with".
    with open(graph_file_name, 'r') as graph_file:
        with open(graph_file_name + ".mm", 'w') as out_file:
            out_file.write("%%MatrixMarket matrix coordinate real general\n")
            # NOTE(review): "<DATE>" looks like a never-filled placeholder;
            # consider writing the real generation date.
            out_file.write("% Generated on <DATE>\n")
            out_file.write(str(len(user_mapping)) + ' '
                           + str(len(item_mapping)) + ' '
                           + str(num_edges) + '\n')

            for row in csv.reader(graph_file, delimiter=DELIM):
                out_file.write(str(user_mapping[row[0]]) + ' '
                               + str(item_mapping[row[1]]) + ' '
                               + row[2] + '\n')

    return {'num_edges': num_edges, 'num_features': 0}
|
|
||
|
|
||
def parse_vertex_features(vertex_mapping, feature_file_info_str):
    """Convert a vertex (user/item) feature file into "id label:value" lines.

    feature_file_info_str is a JSON string describing the input file (see
    FEATURE_JSON_FORMAT): its location, columns to skip, which columns are
    numerical, and the delimiter used inside multi-valued cells.
    vertex_mapping (raw id -> 1-based matrix index) is updated in place for
    vertices that appear only in the feature file.  Output is written to
    <file_name>.conv.

    Returns {'num_entries': <vertices mapped>, 'num_features': <distinct labels>}.
    """
    feature_file_info = simplejson.loads(feature_file_info_str)

    multiple_feature_delim = feature_file_info.get("multiple_feature_delim",
                                                   MULTIPLE_FEATURE_DELIM)
    # BUG FIX: these are dict *keys*, not attributes.  The original accessed
    # feature_file_info.delete_cols / .numerical_attr, which raises
    # AttributeError on the plain dict simplejson.loads returns.
    delete_cols = feature_file_info.get("delete_cols", [])
    numerical_attr = feature_file_info.get("numerical_attr", [])

    next_vertex_id = len(vertex_mapping) + 1
    feature_count = 1      # next unused feature label (labels are 1-based)
    feature_mapping = {}   # (column index, value) -> feature label

    # BUG FIX: the output file was never closed; "with" guarantees flushing.
    with open(feature_file_info["file_name"], 'r') as feature_file:
        with open(feature_file_info["file_name"] + ".conv", 'w') as out_file:
            for row in csv.reader(feature_file, delimiter=DELIM):
                # A vertex seen only here (not in the graph) still gets an index.
                if row[0] not in vertex_mapping:
                    vertex_mapping[row[0]] = next_vertex_id
                    next_vertex_id += 1

                parts = [str(vertex_mapping[row[0]])]
                for i in range(1, len(row)):
                    if i in delete_cols:
                        continue

                    if i in numerical_attr:
                        # Numerical column: one shared label per column,
                        # value emitted as-is.
                        if (i, 0) not in feature_mapping:
                            feature_mapping[(i, 0)] = feature_count
                            feature_count += 1
                        parts.append(str(feature_mapping[(i, 0)]) + ":" + row[i])
                        continue

                    # Categorical column: one label per distinct
                    # (column, value) pair; cells may hold several values.
                    for val in row[i].split(multiple_feature_delim):
                        if (i, val) not in feature_mapping:
                            feature_mapping[(i, val)] = feature_count
                            feature_count += 1
                        parts.append(str(feature_mapping[(i, val)]) + ":1")

                out_file.write(DELIM.join(parts) + '\n')

    # BUG FIX: feature_count is the *next unused* label, so the number of
    # distinct features handed out is feature_count - 1; the original
    # over-reported by one.
    return {'num_entries': len(vertex_mapping),
            'num_features': feature_count - 1}
|
|
||
|
|
||
if __name__ == "__main__":
    (options, args) = parse_command_line()

    # BUG FIX: optparse always defines the attribute (value None when the
    # flag is absent), so hasattr(options, "user_file_info") was always
    # true and a missing flag crashed inside simplejson.loads(None).
    # Test the value itself instead.
    user_mapping = {}
    users_info = {}
    if options.user_file_info is not None:
        users_info = parse_vertex_features(user_mapping, options.user_file_info)

    item_mapping = {}
    items_info = {}
    if options.item_file_info is not None:
        items_info = parse_vertex_features(item_mapping, options.item_file_info)

    # Convert the edge list; this also completes both vertex mappings.
    graph_info = convert_to_matrix_market(options.graph_file, user_mapping,
                                          item_mapping)

    # Persist the bookkeeping (sizes and id mappings) for downstream use.
    with open(options.graph_file + ".info", 'w') as f:
        f.write(
            simplejson.dumps(
                {
                    'num_users': len(user_mapping),
                    'num_user_features': users_info.get('num_features', 0),
                    'num_items': len(item_mapping),
                    'num_item_features': items_info.get('num_features', 0),
                    'num_edge_features': graph_info.get('num_features', 0),
                    'num_edges': graph_info.get('num_edges', 0),
                    'user_mapping': user_mapping,
                    'item_mapping': item_mapping
                }
            )
        )
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,80 @@ | ||
| from optparse import OptionParser | ||
|
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This file can be ignored. Specific to a dataset parsing |
||
| import csv | ||
| import simplejson | ||
|
|
||
| DELIM = '\t' | ||
|
|
||
| MULTIPLE_FEATURE_DELIM = ',' | ||
|
|
||
| month_mapping = {"Jan":1, "Feb":2, "Mar":3, "Apr":4, "May":5, "Jun":6, "Jul":7, "Aug":8, "Sep":9, "Oct":10, "Nov":11, "Dec":12} | ||
|
|
||
def parse_command_line():
    """Parse the lastFM user-feature binning options from sys.argv.

    Returns the (options, args) pair produced by OptionParser.parse_args().
    """
    parser = OptionParser(
        usage="python lastFM_user_feature.py -u=<user-file> -a=<age_bin_interval>"
              " -dy=<date_bin_year> -dm=<date_bin_month> -dd=<date_bin_day>"
              "[other optional options]")

    # Input file with the raw user features.
    parser.add_option("-u", "--user-file", action="store", type="string",
                      dest="user_file",
                      help="The file containing the user features")

    # Bin sizes; for the date bins 0 means "do not bin on this unit".
    parser.add_option("-a", "--age_bin_interval", action="store", type="int",
                      dest="age_interval", default=5,
                      help="The interval of an age bin")
    parser.add_option("-y", "--date_bin_year", action="store", type="int",
                      dest="year_interval", default=0,
                      help="The interval of date bin on year")
    parser.add_option("-m", "--date_bin_month", action="store", type="int",
                      dest="month_interval", default=0,
                      help="The interval of date bin on month")
    parser.add_option("-d", "--date_bin_day", action="store", type="int",
                      dest="day_interval", default=0,
                      help="The interval of date bin on day")

    return parser.parse_args()
|
|
||
|
|
||
def date_key_conversion(date, year_interval, month_interval, day_interval):
    """Bin a date string such as "May 1, 2009" into a coarse bucket key.

    Exactly one granularity is honoured, finest first: a non-zero
    day_interval yields "year month day_bin"; else a non-zero
    month_interval yields "year month_bin"; else a non-zero year_interval
    yields "year_bin"; otherwise every day is its own bin.

    Returns the bucket key as a string.
    """
    # Local month table so the function is self-contained (mirrors the
    # module-level month_mapping).
    month_to_num = {"Jan": 1, "Feb": 2, "Mar": 3, "Apr": 4, "May": 5,
                    "Jun": 6, "Jul": 7, "Aug": 8, "Sep": 9, "Oct": 10,
                    "Nov": 11, "Dec": 12}

    fields = date.replace(',', ' ').split()   # e.g. ["May", "1", "2009"]
    year = int(fields[2])
    month = month_to_num[fields[0]]
    day = int(fields[1])

    # BUG FIX: use floor division.  Plain "/" was Python-2 integer
    # division; under Python 3 it produces float keys like "2009 5 1.28".
    if day_interval != 0:
        return str(year) + ' ' + str(month) + ' ' + str(day // day_interval)
    if month_interval != 0:
        return str(year) + ' ' + str(month // month_interval)
    if year_interval != 0:
        return str(year // year_interval)
    # Nothing specified: each day is an independent bin.
    return str(year) + ' ' + str(month) + ' ' + str(day)
|
|
||
def age_key_conversion(age, age_interval):
    """Bin an age string into a bucket key.

    An empty age (missing data) passes through unchanged; a zero interval
    returns the age as-is; otherwise the bin index (age // interval) is
    returned as a string.
    """
    if age == '':
        return age
    if age_interval == 0:
        return age
    # BUG FIX: floor division.  Plain "/" was Python-2 integer division;
    # under Python 3 it would produce float keys such as "4.6".
    return str(int(age) // age_interval)
|
|
||
def parse_user_features(user_feature_file, age_interval, year_interval,
                        month_interval, day_interval):
    """Rewrite the raw user feature file with binned age and signup date.

    Expects DELIM-separated rows of (user_id, col1, age, col3, date);
    the age and date columns are replaced by their bin keys (see
    age_key_conversion / date_key_conversion).  Output goes to
    <user_feature_file>_age<A>_<Y>y<M>m<D>d.conv.
    """
    out_name = (user_feature_file + "_age" + str(age_interval) + "_"
                + str(year_interval) + "y" + str(month_interval) + "m"
                + str(day_interval) + "d" + ".conv")

    # BUG FIX: the output file was never closed; "with" guarantees the
    # buffers are flushed even if a row fails to parse.
    with open(user_feature_file, 'r') as feature_file:
        with open(out_name, 'w') as user_out_file:
            for row in csv.reader(feature_file, delimiter=DELIM):
                age_key = age_key_conversion(row[2], age_interval)
                date_key = date_key_conversion(row[4], year_interval,
                                               month_interval, day_interval)
                user_out_file.write(row[0] + DELIM + row[1] + DELIM + age_key
                                    + DELIM + row[3] + DELIM + date_key + '\n')
|
|
||
if __name__ == "__main__":
    # Script entry point: parse the options and run the feature conversion.
    options, _args = parse_command_line()

    graph_info = parse_user_features(
        options.user_file,
        options.age_interval,
        options.year_interval,
        options.month_interval,
        options.day_interval,
    )
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,21 @@ | ||
| import sys | ||
|
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can be ignored |
||
| import csv | ||
|
|
||
| DELIM = "|" | ||
| OUT_DELIM = '\t' | ||
|
|
||
if __name__ == "__main__":
    # Convert a "|"-delimited user file into "user_id<TAB>i,j,k" rows,
    # where i,j,k are the indices (from column 5 on) whose value is '1'.
    # Usage: python <script> <user-file>; output goes to <user-file>.processed.
    #
    # BUG FIX: per the review note, the indentation of the original was
    # broken -- the trailing-comma strip and the write must run once per
    # row, *after* the inner column loop.  The output file is now also
    # closed deterministically via "with".
    with open(sys.argv[1], 'r') as user_file:
        with open(sys.argv[1] + ".processed", 'w') as out_file:
            reader = csv.reader(user_file, delimiter=DELIM)
            for row in reader:
                out_str = row[0] + OUT_DELIM
                for i in range(5, len(row)):
                    if row[i] == '1':
                        out_str = out_str + str(i) + ","
                # Drop the trailing comma left by the loop, if any.
                if out_str[-1] == ',':
                    out_str = out_str[:-1]
                out_file.write(out_str + '\n')
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Need to fix the indentation. It is pretty screwed up here