From aee121b651c580bece8dc9169340ead7cb2abe89 Mon Sep 17 00:00:00 2001 From: quintinali Date: Tue, 26 Jun 2018 17:10:48 -0700 Subject: [PATCH 01/13] made ranking a new module initial work --- ranking/pom.xml | 215 ++++++++++++ .../src/main/assembly/zipSVMWithSGDModel.xml | 24 ++ .../mudrod/ranking/common/Dispatcher.java | 128 ++++++++ .../sdap/mudrod/ranking/common/Evaluator.java | 142 ++++++++ .../sdap/mudrod/ranking/common/Ranker.java | 170 ++++++++++ .../sdap/mudrod/ranking/common/Searcher.java | 265 +++++++++++++++ .../mudrod/ranking/common/package-info.java | 18 + .../offline/predict/OfflineLearner.java | 57 ++++ .../offline/train/ClickstreamImporter.java | 113 +++++++ .../offline/train/OfflineDataGenerator.java | 310 ++++++++++++++++++ .../ranking/offline/train/SparkFormatter.java | 55 ++++ .../ranking/offline/train/SparkSVM.java | 49 +++ .../offline/train/TrainingImporter.java | 91 +++++ .../ranking/offline/train/package-info.java | 18 + .../mudrod/ranking/structure/SResult.java | 182 ++++++++++ .../ranking/structure/package-info.java | 17 + ranking/src/main/resources/config.properties | 74 +++++ .../src/main/resources/elastic_mappings.json | 68 ++++ .../src/main/resources/elastic_settings.json | 36 ++ .../javaSVMWithSGDModel/data/_SUCCESS | 0 .../javaSVMWithSGDModel/data/_common_metadata | Bin 0 -> 1022 bytes .../javaSVMWithSGDModel/data/_metadata | Bin 0 -> 1757 bytes .../javaSVMWithSGDModel/metadata/_SUCCESS | 0 .../javaSVMWithSGDModel/metadata/part-00000 | 1 + ranking/src/main/resources/log4j.properties | 63 ++++ ranking/src/main/resources/log4j2.properties | 63 ++++ 26 files changed, 2159 insertions(+) create mode 100644 ranking/pom.xml create mode 100644 ranking/src/main/assembly/zipSVMWithSGDModel.xml create mode 100644 ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/Dispatcher.java create mode 100644 ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/Evaluator.java create mode 100644 
ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/Ranker.java create mode 100644 ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/Searcher.java create mode 100644 ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/package-info.java create mode 100644 ranking/src/main/java/org/apache/sdap/mudrod/ranking/offline/predict/OfflineLearner.java create mode 100644 ranking/src/main/java/org/apache/sdap/mudrod/ranking/offline/train/ClickstreamImporter.java create mode 100644 ranking/src/main/java/org/apache/sdap/mudrod/ranking/offline/train/OfflineDataGenerator.java create mode 100644 ranking/src/main/java/org/apache/sdap/mudrod/ranking/offline/train/SparkFormatter.java create mode 100644 ranking/src/main/java/org/apache/sdap/mudrod/ranking/offline/train/SparkSVM.java create mode 100644 ranking/src/main/java/org/apache/sdap/mudrod/ranking/offline/train/TrainingImporter.java create mode 100644 ranking/src/main/java/org/apache/sdap/mudrod/ranking/offline/train/package-info.java create mode 100644 ranking/src/main/java/org/apache/sdap/mudrod/ranking/structure/SResult.java create mode 100644 ranking/src/main/java/org/apache/sdap/mudrod/ranking/structure/package-info.java create mode 100644 ranking/src/main/resources/config.properties create mode 100644 ranking/src/main/resources/elastic_mappings.json create mode 100644 ranking/src/main/resources/elastic_settings.json create mode 100644 ranking/src/main/resources/javaSVMWithSGDModel/data/_SUCCESS create mode 100644 ranking/src/main/resources/javaSVMWithSGDModel/data/_common_metadata create mode 100644 ranking/src/main/resources/javaSVMWithSGDModel/data/_metadata create mode 100644 ranking/src/main/resources/javaSVMWithSGDModel/metadata/_SUCCESS create mode 100644 ranking/src/main/resources/javaSVMWithSGDModel/metadata/part-00000 create mode 100644 ranking/src/main/resources/log4j.properties create mode 100644 ranking/src/main/resources/log4j2.properties diff --git a/ranking/pom.xml b/ranking/pom.xml 
new file mode 100644 index 0000000..490738b --- /dev/null +++ b/ranking/pom.xml @@ -0,0 +1,215 @@ + + + + 4.0.0 + + + org.apache.sdap.mudrod + mudrod-parent + 0.0.1-SNAPSHOT + ../ + + + mudrod-ranking + + Mudrod :: Ranking + Mudrod ranking algorithm implementation. + + + + javaSVMWithSGDModel + + + + + + org.apache.sdap.mudrod + mudrod-core + ${project.version} + + + + + + + ${basedir}/src/main/resources + true + + ${svmSgdModel.value}/** + + + + + ${project.build.directory} + + ${svmSgdModel.value}.zip + + + + + ${basedir}/../ + META-INF + + LICENSE.txt + NOTICE.txt + + + + + + + + org.codehaus.mojo + appassembler-maven-plugin + 1.10 + + + package + + assemble + + + + + flat + lib + + + org.apache.sdap.mudrod.main.MudrodEngine + + mudrod-engine + + + + + + + + org.apache.maven.plugins + maven-assembly-plugin + 2.6 + + + zipSVMWithSGDModel + generate-resources + + single + + + false + posix + ${svmSgdModel.value} + ${project.build.directory} + + + + ${basedir}/src/main/assembly/zipSVMWithSGDModel.xml + + + + + + + generateDistribution + package + + single + + + false + posix + + + ${basedir}/src/main/assembly/bin.xml + + + + + + + + org.apache.maven.plugins + maven-jar-plugin + 3.0.2 + + + + test-jar + + + + + + + org.apache.maven.plugins + maven-shade-plugin + 3.0.0 + + + package + + shade + + + + + + + org.apache.sdap.mudrod.main.MudrodEngine + + ${implementation.build} + + + + + + + *:* + + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + + + + + ${project.artifactId}-uber-${project.version} + + + + + + + + + + + + release + + + + ${basedir}/../ + + ${project.build.directory}/apidocs/META-INF + + + LICENSE.txt + NOTICE.txt + + + + + + + + diff --git a/ranking/src/main/assembly/zipSVMWithSGDModel.xml b/ranking/src/main/assembly/zipSVMWithSGDModel.xml new file mode 100644 index 0000000..6f277f7 --- /dev/null +++ b/ranking/src/main/assembly/zipSVMWithSGDModel.xml @@ -0,0 +1,24 @@ + + + + zipSVMWithSGDModel + ${svmSgdModel.value} + + zip + + + + 
${basedir}/src/main/resources/${svmSgdModel.value} + . + + + \ No newline at end of file diff --git a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/Dispatcher.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/Dispatcher.java new file mode 100644 index 0000000..511b8c4 --- /dev/null +++ b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/Dispatcher.java @@ -0,0 +1,128 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.sdap.mudrod.ranking.common; + +import org.apache.sdap.mudrod.discoveryengine.MudrodAbstract; +import org.apache.sdap.mudrod.driver.ESDriver; +import org.apache.sdap.mudrod.driver.SparkDriver; +import org.apache.sdap.mudrod.integration.LinkageIntegration; +import org.elasticsearch.index.query.BoolQueryBuilder; +import org.elasticsearch.index.query.MatchQueryBuilder; +import org.elasticsearch.index.query.MultiMatchQueryBuilder; +import org.elasticsearch.index.query.QueryBuilders; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.HashMap; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Properties; + +/** + * Supports ability to transform regular user query into a semantic query + */ +public class Dispatcher extends MudrodAbstract { + private static final Logger LOG = LoggerFactory.getLogger(Dispatcher.class); + + public Dispatcher(Properties props, ESDriver es, SparkDriver spark) { + super(props, es, spark); + } + + /** + * Method 
of getting semantically most related terms by number + * + * @param input regular input query + * @param num the number of most related terms + * @return a map from term to similarity + */ + public Map getRelatedTerms(String input, int num) { + LinkageIntegration li = new LinkageIntegration(props, this.es, null); + Map sortedMap = li.appyMajorRule(input); + Map selected_Map = new HashMap<>(); + int count = 0; + for (Entry entry : sortedMap.entrySet()) { + if (count < num) { + selected_Map.put(entry.getKey(), entry.getValue()); + } + count++; + } + return selected_Map; + } + + /** + * Method of getting semantically most related terms by similarity threshold + * + * @param input regular input query + * @param T value of threshold, raning from 0 to 1 + * @return a map from term to similarity + */ + public Map getRelatedTermsByT(String input, double T) { + LinkageIntegration li = new LinkageIntegration(this.props, this.es, null); + Map sortedMap = li.appyMajorRule(input); + Map selected_Map = new HashMap<>(); + + for (Entry entry : sortedMap.entrySet()) { + if (entry.getValue() >= T) { + selected_Map.put(entry.getKey(), entry.getValue()); + } + } + return selected_Map; + } + + /** + * Method of creating semantic query based on Threshold + * + * @param input regular query + * @param T threshold raning from 0 to 1 + * @param query_operator query mode + * @return a multiMatch query builder + */ + public BoolQueryBuilder createSemQuery(String input, double T, String query_operator) { + Map selected_Map = getRelatedTermsByT(input, T); + selected_Map.put(input, (double) 1); + + String fieldsList[] = { "Dataset-Metadata", "Dataset-ShortName", "Dataset-LongName", + "DatasetParameter-Topic", "DatasetParameter-VariableDetail", "DatasetParameter-Category", + "DatasetParameter-Variable", "DatasetParameter-Term", + "DatasetSource-Source-LongName", "DatasetSource-Source-LongName-Full", + "DatasetSource-Source-ShortName", "DatasetSource-Source-ShortName-Full", + 
"DatasetSource-Sensor-LongName", "DatasetSource-Sensor-LongName-Full", "DatasetSource-Sensor-ShortName", + "DatasetSource-Sensor-ShortName-Full" }; + BoolQueryBuilder qb = new BoolQueryBuilder(); + for (Entry entry : selected_Map.entrySet()) { + if (query_operator.toLowerCase().trim().equals("phrase")) { + qb.should(QueryBuilders.multiMatchQuery(entry.getKey(), fieldsList).boost(entry.getValue().floatValue()).type(MultiMatchQueryBuilder.Type.PHRASE).tieBreaker((float) 0.5)); // when + // set + // to + // 1.0, + // it + // would + // be + // equal + // to + // "most + // fields" + // query + } else if (query_operator.toLowerCase().trim().equals("and")) { + qb.should(QueryBuilders.multiMatchQuery(entry.getKey(), fieldsList).boost(entry.getValue().floatValue()).operator(MatchQueryBuilder.DEFAULT_OPERATOR.AND).tieBreaker((float) 0.5)); + } else { + qb.should(QueryBuilders.multiMatchQuery(entry.getKey(), fieldsList).boost(entry.getValue().floatValue()).operator(MatchQueryBuilder.DEFAULT_OPERATOR.OR).tieBreaker((float) 0.5)); + } + } + + // LOG.info(qb.toString()); + return qb; + } + +} diff --git a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/Evaluator.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/Evaluator.java new file mode 100644 index 0000000..c84c537 --- /dev/null +++ b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/Evaluator.java @@ -0,0 +1,142 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.sdap.mudrod.ranking.common; + +import java.util.Collections; +import java.util.Comparator; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +/** + * Supports ability to evaluating ranking results + */ +public class Evaluator { + /** + * Method of calculating NDCG score + * + * @param list a list of integer with each integer element indicating + * the performance at its position + * @param k the number of elements needed to be included in the calculation + * @return NDCG score + */ + public double getNDCG(int[] list, int k) { + double dcg = this.getDCG(list, k); + double idcg = this.getIDCG(list, k); + double ndcg = 0.0; + if (idcg > 0.0) { + ndcg = dcg / idcg; + } + return ndcg; + } + + /** + * Method of getting the precision of a list at position K + * + * @param list a list of integer with each integer element indicating + * the performance at its position + * @param k the number of elements needed to be included in the calculation + * @return precision at K + */ + public double getPrecision(int[] list, int k) { + int size = list.length; + if (size == 0 || k == 0) { + return 0; + } + + if (k > size) { + k = size; + } + + int relDocNum = this.getRelevantDocNum(list, k); + return (double) relDocNum / (double) k; + } + + /** + * Method of getting the number of relevant element in a ranking results + * + * @param list a list of integer with each integer element indicating + * the performance at its position + * @param k the number of elements needed to be included in the calculation + * @return the number of relevant element + */ + private int getRelevantDocNum(int[] list, int k) { + int size = list.length; + if (size == 0 || k == 0) { + return 0; + } + + if (k > size) { + k = size; + } + + int relNum = 0; + for (int i = 0; i < k; i++) { + if (list[i] > 3) { // 3 refers to "OK" + relNum++; + } + } + return relNum; + } + + /** + * Method of calculating DCG score from a list of ranking results + 
* + * @param list a list of integer with each integer element indicating + * the performance at its position + * @param k the number of elements needed to be included in the calculation + * @return DCG score + */ + private double getDCG(int[] list, int k) { + int size = list.length; + if (size == 0 || k == 0) { + return 0.0; + } + + if (k > size) { + k = size; + } + + double dcg = list[0]; + for (int i = 1; i < k; i++) { + int rel = list[i]; + int pos = i + 1; + double relLog = Math.log(pos) / Math.log(2); + dcg += rel / relLog; + } + return dcg; + } + + /** + * Method of calculating ideal DCG score from a list of ranking results + * + * @param list a list of integer with each integer element indicating + * the performance at its position + * @param k the number of elements needed to be included in the calculation + * @return IDCG score + */ + private double getIDCG(int[] list, int k) { + Comparator comparator = new Comparator() { + @Override + public int compare(Integer o1, Integer o2) { + return o2.compareTo(o1); + } + }; + List sortlist = IntStream.of(list).boxed().collect(Collectors.toList()); + Collections.sort(sortlist, comparator); + int[] sortedArr = sortlist.stream().mapToInt(i -> i).toArray(); + return this.getDCG(sortedArr, k); + } + +} diff --git a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/Ranker.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/Ranker.java new file mode 100644 index 0000000..be2f5c9 --- /dev/null +++ b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/Ranker.java @@ -0,0 +1,170 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.sdap.mudrod.ranking.common; + +import org.apache.sdap.mudrod.discoveryengine.MudrodAbstract; +import org.apache.sdap.mudrod.driver.ESDriver; +import org.apache.sdap.mudrod.driver.SparkDriver; +import org.apache.sdap.mudrod.main.MudrodConstants; +import org.apache.sdap.mudrod.ssearch.ranking.Learner; +import org.apache.sdap.mudrod.ssearch.structure.SResult; +import org.apache.spark.mllib.linalg.Vectors; +import org.apache.spark.mllib.regression.LabeledPoint; + +import java.io.Serializable; +import java.text.DecimalFormat; +import java.util.*; + +/** + * Supports the ability to calculating ranking score + */ +public class Ranker extends MudrodAbstract implements Serializable { + private static final long serialVersionUID = 1L; + transient List resultList = new ArrayList<>(); + Learner le = null; + + public Ranker(Properties props, ESDriver es, SparkDriver spark) { + super(props, es, spark); + if("1".equals(props.getProperty(MudrodConstants.RANKING_ML))) + le = new Learner(spark, props.getProperty(MudrodConstants.RANKING_MODEL)); + } + + /** + * Method of calculating mean value + * + * @param attribute the attribute name that need to be calculated on + * @param resultList an array list of result + * @return mean value + */ + private double getMean(String attribute, List resultList) { + double sum = 0.0; + for (SResult a : resultList) { + sum += (double) SResult.get(a, attribute); + } + return getNDForm(sum / resultList.size()); + } + + /** + * Method of calculating variance value + * + * @param attribute the attribute name 
that need to be calculated on + * @param resultList an array list of result + * @return variance value + */ + private double getVariance(String attribute, List resultList) { + double mean = getMean(attribute, resultList); + double temp = 0.0; + double val; + for (SResult a : resultList) { + val = (Double) SResult.get(a, attribute); + temp += (mean - val) * (mean - val); + } + + return getNDForm(temp / resultList.size()); + } + + /** + * Method of calculating standard variance + * + * @param attribute the attribute name that need to be calculated on + * @param resultList an array list of result + * @return standard variance + */ + private double getStdDev(String attribute, List resultList) { + return getNDForm(Math.sqrt(getVariance(attribute, resultList))); + } + + /** + * Method of calculating Z score + * + * @param val the value of an attribute + * @param mean the mean value of an attribute + * @param std the standard deviation of an attribute + * @return Z score + */ + private double getZscore(double val, double mean, double std) { + if (!equalComp(std, 0)) { + return getNDForm((val - mean) / std); + } else { + return 0; + } + } + + private boolean equalComp(double a, double b) { + return Math.abs(a - b) < 0.0001; + } + + /** + * Get the first N decimals of a double value + * + * @param d double value that needs to be processed + * @return processed double value + */ + private double getNDForm(double d) { + DecimalFormat ndForm = new DecimalFormat("#.###"); + return Double.valueOf(ndForm.format(d)); + } + + /** + * Method of ranking a list of result + * + * @param resultList result list + * @return ranked result list + */ + public List rank(List resultList) { + if(le==null) return resultList; + + for (int i = 0; i < resultList.size(); i++) { + for (int m = 0; m < SResult.rlist.length; m++) { + String att = SResult.rlist[m].split("_")[0]; + double val = SResult.get(resultList.get(i), att); + double mean = getMean(att, resultList); + double std = getStdDev(att, 
resultList); + double score = getZscore(val, mean, std); + String scoreId = SResult.rlist[m]; + SResult.set(resultList.get(i), scoreId, score); + } + } + + Collections.sort(resultList, new ResultComparator()); + return resultList; + } + + /** + * Method of comparing results based on final score + */ + public class ResultComparator implements Comparator { + @Override + /** + * @param o1 one item from the search result list + * @param o2 another item from the search result list + * @return 1 meaning o1>o2, 0 meaning o1=o2 + */ + public int compare(SResult o1, SResult o2) { + List instList = new ArrayList<>(); + for (String str: SResult.rlist) { + double o2Score = SResult.get(o2, str); + double o1Score = SResult.get(o1, str); + instList.add(o2Score - o1Score); + } + + double[] ins = instList.stream().mapToDouble(i -> i).toArray(); + LabeledPoint insPoint = new LabeledPoint(99.0, Vectors.dense(ins)); + int prediction = (int)le.classify(insPoint); + + return prediction; + } + } + +} diff --git a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/Searcher.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/Searcher.java new file mode 100644 index 0000000..e3f7660 --- /dev/null +++ b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/Searcher.java @@ -0,0 +1,265 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.sdap.mudrod.ranking.common; + +import com.google.gson.Gson; +import com.google.gson.JsonElement; +import com.google.gson.JsonObject; + +import org.apache.sdap.mudrod.discoveryengine.MudrodAbstract; +import org.apache.sdap.mudrod.driver.ESDriver; +import org.apache.sdap.mudrod.driver.SparkDriver; +import org.apache.sdap.mudrod.ssearch.structure.SResult; +import org.elasticsearch.action.search.SearchRequestBuilder; +import org.elasticsearch.action.search.SearchResponse; +import org.elasticsearch.index.query.BoolQueryBuilder; +import org.elasticsearch.index.query.QueryBuilder; +import org.elasticsearch.index.query.QueryBuilders; +import org.elasticsearch.search.SearchHit; +import org.elasticsearch.search.sort.SortOrder; + +import java.io.Serializable; +import java.text.DecimalFormat; +import java.text.SimpleDateFormat; +import java.util.*; +import java.util.regex.Pattern; + +/** + * Supports ability to performance semantic search with a given query + */ +public class Searcher extends MudrodAbstract implements Serializable { + /** + * + */ + private static final long serialVersionUID = 1L; + DecimalFormat NDForm = new DecimalFormat("#.##"); + final Integer MAX_CHAR = 700; + + public Searcher(Properties props, ESDriver es, SparkDriver spark) { + super(props, es, spark); + } + + /** + * Method of converting processing level string into a number + * + * @param pro processing level string + * @return processing level number + */ + public Double getProLevelNum(String pro) { + if (pro == null) { + return 1.0; + } + Double proNum; + Pattern p = Pattern.compile(".*[a-zA-Z].*"); + if (pro.matches("[0-9]{1}[a-zA-Z]{1}")) { + proNum = Double.parseDouble(pro.substring(0, 1)); + } else if (p.matcher(pro).find()) { + proNum = 1.0; + } else { + proNum = Double.parseDouble(pro); + } + + return proNum; + } + + public Double getPop(Double pop) { + if (pop > 1000) { + pop = 1000.0; + } + return pop; + } + + /** + * Main method of semantic search + * + * @param 
index index name in Elasticsearch + * @param type type name in Elasticsearch + * @param query regular query string + * @param queryOperator query mode- query, or, and + * @param rankOption a keyword used to dertermine the ElasticSearch SortOrder + * @return a list of search result + */ + @SuppressWarnings("unchecked") + public List searchByQuery(String index, String type, String query, String queryOperator, String rankOption) { + boolean exists = es.getClient().admin().indices().prepareExists(index).execute().actionGet().isExists(); + if (!exists) { + return new ArrayList<>(); + } + + SortOrder order = null; + String sortFiled = ""; + switch (rankOption) { + case "Rank-AllTimePopularity": + sortFiled = "Dataset-AllTimePopularity"; + order = SortOrder.DESC; + break; + case "Rank-MonthlyPopularity": + sortFiled = "Dataset-MonthlyPopularity"; + order = SortOrder.DESC; + break; + case "Rank-UserPopularity": + sortFiled = "Dataset-UserPopularity"; + order = SortOrder.DESC; + break; + case "Rank-LongName-Full": + sortFiled = "Dataset-LongName.raw"; + order = SortOrder.ASC; + break; + case "Rank-ShortName-Full": + sortFiled = "Dataset-ShortName.raw"; + order = SortOrder.ASC; + break; + case "Rank-GridSpatialResolution": + sortFiled = "Dataset-GridSpatialResolution"; + order = SortOrder.DESC; + break; + case "Rank-SatelliteSpatialResolution": + sortFiled = "Dataset-SatelliteSpatialResolution"; + order = SortOrder.DESC; + break; + case "Rank-StartTimeLong-Long": + sortFiled = "DatasetCoverage-StartTimeLong-Long"; + order = SortOrder.ASC; + break; + case "Rank-StopTimeLong-Long": + sortFiled = "DatasetCoverage-StopTimeLong-Long"; + order = SortOrder.DESC; + break; + default: + sortFiled = "Dataset-ShortName.raw"; + order = SortOrder.ASC; + break; + } + + Dispatcher dp = new Dispatcher(this.getConfig(), this.getES(), null); + BoolQueryBuilder qb = dp.createSemQuery(query, 1.0, queryOperator); + List resultList = new ArrayList<>(); + + SearchRequestBuilder builder = 
es.getClient().prepareSearch(index).setTypes(type).setQuery(qb).addSort(sortFiled, order).setSize(500).setTrackScores(true); + SearchResponse response = builder.execute().actionGet(); + + for (SearchHit hit : response.getHits().getHits()) { + Map result = hit.getSource(); + Double relevance = Double.valueOf(NDForm.format(hit.getScore())); + String shortName = (String) result.get("Dataset-ShortName"); + String longName = (String) result.get("Dataset-LongName"); + + ArrayList topicList = (ArrayList) result.get("DatasetParameter-Variable"); + String topic = ""; + if (null != topicList) { + topic = String.join(", ", topicList); + } + String content = (String) result.get("Dataset-Description"); + + if (!"".equals(content)) { + int maxLength = (content.length() < MAX_CHAR) ? content.length() : MAX_CHAR; + content = content.trim().substring(0, maxLength - 1) + "..."; + } + + ArrayList longdate = (ArrayList) result.get("DatasetCitation-ReleaseDateLong"); + Date date = new Date(Long.valueOf(longdate.get(0))); + SimpleDateFormat df2 = new SimpleDateFormat("MM/dd/yyyy"); + String dateText = df2.format(date); + + // start date + Long start = (Long) result.get("DatasetCoverage-StartTimeLong-Long"); + Date startDate = new Date(start); + String startDateTxt = df2.format(startDate); + + // end date + String end = (String) result.get("Dataset-DatasetCoverage-StopTimeLong"); + String endDateTxt = ""; + if ("".equals(end)) { + endDateTxt = "Present"; + } else { + Date endDate = new Date(Long.valueOf(end)); + endDateTxt = df2.format(endDate); + } + + String processingLevel = (String) result.get("Dataset-ProcessingLevel"); + Double proNum = getProLevelNum(processingLevel); + + Double userPop = getPop(((Integer) result.get("Dataset-UserPopularity")).doubleValue()); + Double allPop = getPop(((Integer) result.get("Dataset-AllTimePopularity")).doubleValue()); + Double monthPop = getPop(((Integer) result.get("Dataset-MonthlyPopularity")).doubleValue()); + + List sensors = (List) 
result.get("DatasetSource-Sensor-ShortName"); + + SResult re = new SResult(shortName, longName, topic, content, dateText); + + SResult.set(re, "term", relevance); + SResult.set(re, "releaseDate", Long.valueOf(longdate.get(0)).doubleValue()); + SResult.set(re, "processingLevel", processingLevel); + SResult.set(re, "processingL", proNum); + SResult.set(re, "userPop", userPop); + SResult.set(re, "allPop", allPop); + SResult.set(re, "monthPop", monthPop); + SResult.set(re, "startDate", startDateTxt); + SResult.set(re, "endDate", endDateTxt); + SResult.set(re, "sensors", String.join(", ", sensors)); + + QueryBuilder queryLabelSearch = QueryBuilders.boolQuery().must(QueryBuilders.termQuery("query", query)).must(QueryBuilders.termQuery("dataID", shortName)); + SearchResponse labelRes = es.getClient().prepareSearch(index).setTypes("trainingranking").setQuery(queryLabelSearch).setSize(5).execute().actionGet(); + String labelString = null; + for (SearchHit label : labelRes.getHits().getHits()) { + Map labelItem = label.getSource(); + labelString = (String) labelItem.get("label"); + } + SResult.set(re, "label", labelString); + resultList.add(re); + } + + return resultList; + } + + /** + * Method of semantic search to generate JSON string + * + * @param index index name in Elasticsearch + * @param type type name in Elasticsearch + * @param query regular query string + * @param queryOperator query mode- query, or, and + * @param rankOption a keyword used to dertermine the ElasticSearch SortOrder + * @param rr selected ranking method + * @return search results + */ + public String ssearch(String index, String type, String query, String queryOperator, String rankOption, Ranker rr) { + List li = searchByQuery(index, type, query, queryOperator, rankOption); + if ("Rank-SVM".equals(rankOption)) { + li = rr.rank(li); + } + Gson gson = new Gson(); + List fileList = new ArrayList<>(); + + for (SResult aLi : li) { + JsonObject file = new JsonObject(); + file.addProperty("Short Name", 
(String) SResult.get(aLi, "shortName")); + file.addProperty("Long Name", (String) SResult.get(aLi, "longName")); + file.addProperty("Topic", (String) SResult.get(aLi, "topic")); + file.addProperty("Description", (String) SResult.get(aLi, "description")); + file.addProperty("Release Date", (String) SResult.get(aLi, "relase_date")); + fileList.add(file); + + file.addProperty("Start/End Date", (String) SResult.get(aLi, "startDate") + " - " + (String) SResult.get(aLi, "endDate")); + file.addProperty("Processing Level", (String) SResult.get(aLi, "processingLevel")); + + file.addProperty("Sensor", (String) SResult.get(aLi, "sensors")); + } + JsonElement fileListElement = gson.toJsonTree(fileList); + + JsonObject pDResults = new JsonObject(); + pDResults.add("PDResults", fileListElement); + return pDResults.toString(); + } +} diff --git a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/package-info.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/package-info.java new file mode 100644 index 0000000..2c47e90 --- /dev/null +++ b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/package-info.java @@ -0,0 +1,18 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/** + * This package includes classes for semantic search, such as click stream importer, + * query dispatcher, semantic searcher, and ranker (ranksvm, ordinal/linear regression) + */ +package org.apache.sdap.mudrod.ranking.common; diff --git a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/offline/predict/OfflineLearner.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/offline/predict/OfflineLearner.java new file mode 100644 index 0000000..f7711b8 --- /dev/null +++ b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/offline/predict/OfflineLearner.java @@ -0,0 +1,57 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * Supports the ability to import a trained Spark SVM classifier into memory
 * and use it to classify individual instances.
 */
public class OfflineLearner implements Serializable {

  private static final long serialVersionUID = 1L;
  // Trained SVM model, loaded from disk by the constructor.
  SVMModel model = null;
  // Underlying SparkContext; transient because a SparkContext is not serializable.
  transient SparkContext sc = null;

  /**
   * Constructor that loads a previously trained Spark SVM-with-SGD model.
   *
   * @param skd an instance of the Spark driver
   * @param svmSgdModel path to a trained model (a directory saved by SVMModel.save)
   */
  public OfflineLearner(SparkDriver skd, String svmSgdModel) {
    sc = skd.sc.sc();
    // Recursively distribute the model files before loading them.
    sc.addFile(svmSgdModel, true);
    model = SVMModel.load(sc, svmSgdModel);
  }

  /**
   * Classifies a single instance with the loaded model.
   *
   * @param p the instance that needs to be classified
   * @return the predicted class id
   */
  public double classify(LabeledPoint p) {
    return model.predict(p.features());
  }

}
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.sdap.mudrod.ranking.offline.train; + +import org.apache.sdap.mudrod.discoveryengine.MudrodAbstract; +import org.apache.sdap.mudrod.driver.ESDriver; +import org.apache.sdap.mudrod.driver.SparkDriver; +import org.apache.sdap.mudrod.main.MudrodConstants; +import org.elasticsearch.action.index.IndexRequest; +import org.elasticsearch.common.xcontent.XContentBuilder; + +import java.io.BufferedReader; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.util.Properties; + +import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; + +/** + * Supports ability to import click stream data into Elasticsearch + * through .csv file + */ +public class ClickstreamImporter extends MudrodAbstract { + /** + * + */ + private static final long serialVersionUID = 1L; + + public ClickstreamImporter(Properties props, ESDriver es, SparkDriver spark) { + super(props, es, spark); + addClickStreamMapping(); + } + + /** + * Method to add Elasticsearch mapping for click stream data + */ + public void addClickStreamMapping() { + XContentBuilder Mapping; + try { + Mapping = jsonBuilder().startObject().startObject( + props.getProperty(MudrodConstants.CLICK_STREAM_MATRIX_TYPE)).startObject( + "properties").startObject("query").field("type", "string").field( + "index", "not_analyzed").endObject().startObject("dataID").field( + "type", "string").field("index", "not_analyzed").endObject() + + .endObject().endObject().endObject(); + + es.getClient().admin().indices().preparePutMapping( + props.getProperty(MudrodConstants.ES_INDEX_NAME)).setType( + props.getProperty(MudrodConstants.CLICK_STREAM_MATRIX_TYPE)).setSource( + Mapping).execute().actionGet(); + } catch (IOException e) { + e.printStackTrace(); + } + } + + /** + * Method to import click stream CSV into Elasticsearch + */ + public void 
importfromCSVtoES() { + es.deleteType(props.getProperty(MudrodConstants.ES_INDEX_NAME), + props.getProperty(MudrodConstants.CLICK_STREAM_MATRIX_TYPE)); + es.createBulkProcessor(); + + BufferedReader br = null; + String cvsSplitBy = ","; + + try { + br = new BufferedReader(new FileReader(props.getProperty(MudrodConstants.CLICKSTREAM_PATH))); + String line = br.readLine(); + // first item needs to be skipped + String[] dataList = line.split(cvsSplitBy); + while ((line = br.readLine()) != null) { + String[] clicks = line.split(cvsSplitBy); + for (int i = 1; i < clicks.length; i++) { + if (!"0.0".equals(clicks[i])) { + IndexRequest ir = new IndexRequest(props.getProperty(MudrodConstants.ES_INDEX_NAME), + props.getProperty(MudrodConstants.CLICK_STREAM_MATRIX_TYPE)) + .source(jsonBuilder().startObject().field("query", clicks[0]).field( + "dataID", dataList[i]).field("clicks", clicks[i]).endObject()); + es.getBulkProcessor().add(ir); + } + } + } + } catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } finally { + if (br != null) { + try { + br.close(); + es.destroyBulkProcessor(); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + } + +} diff --git a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/offline/train/OfflineDataGenerator.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/offline/train/OfflineDataGenerator.java new file mode 100644 index 0000000..9a55523 --- /dev/null +++ b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/offline/train/OfflineDataGenerator.java @@ -0,0 +1,310 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. 
/**
 * OfflineDataGenerator creates input data for the RankSVM algorithm, which
 * involves pairwise classification: instead of working in the space of
 * query-document feature vectors (x1, x2, x3, ...), each pair of documents is
 * transformed into a single vector holding the difference between their
 * feature vectors, labelled "1" or "-1" depending on which document received
 * the better human relevance judgement.
 *
 * NOTE(review): configuration and results live in static fields that the
 * constructor overwrites, so instances share state; the class is neither
 * thread-safe nor reusable for several data sets within one JVM -- confirm
 * this is acceptable for the offline training workflow.
 */
public class OfflineDataGenerator {
  // Directory (or single file) the raw evaluation CSVs are read from; set by the constructor.
  private static String mySourceDir;
  // Path of the CSV file the pairwise training data is written to; set by the constructor.
  private static String myResultDir;
  // true: process every file under mySourceDir; false: mySourceDir names a single file.
  private static boolean isMultFiles;

  // Header row copied from a parsed input file (see storeHead).
  private static String[] myHeader;
  // Accumulates the equalized pairwise rows from all processed files.
  private static List<List<String>> myMasterList = new ArrayList<>();

  // Maps the textual relevance judgement onto an ordinal scale so two
  // judgements can be compared; higher value means "more relevant".
  public static final Map<String, Integer> map1 = new HashMap<>();

  static {
    map1.put("Excellent", 7);
    map1.put("Very good", 6);
    map1.put("Good", 5);
    map1.put("OK", 4);
    map1.put("Bad", 3);
    map1.put("Very bad", 2);
    map1.put("Terrible", 1);
  }

  /**
   * Constructor which takes in the path containing one or multiple files to process.
   * Also takes in an argument specifying whether a single file or multiple files
   * need to be processed.
   *
   * @param sourceDir directory containing a single file or multiple files to be processed
   * @param resultDir output file path
   * @param multFiles true if multiple files in the directory need to be processed and false if
   *                  only a single file needs to be processed
   */
  public OfflineDataGenerator(String sourceDir, String resultDir, boolean multFiles) {
    mySourceDir = sourceDir;
    myResultDir = resultDir;
    isMultFiles = multFiles;
  }

  /**
   * Responsible for invoking the processing of the data file(s) and their
   * subsequent storage into the user-specified output file.
   */
  public void process() {
    parseFile();
    writeCSVfile(myMasterList);
  }

  /**
   * Parses the original user-specified CSV file(s), storing the contents for
   * the pairwise calculations and capturing the header row.
   */
  public static void parseFile() {
    String[][] dataArr = null;
    try {
      String sourceDir = mySourceDir;

      if (isMultFiles) // Case where multiple files have to be processed
      {
        // Iterate over files in directory
        File directory = new File(sourceDir);
        File[] directoryListing = directory.listFiles();

        if (directoryListing != null) {
          for (File child : directoryListing) {
            CSVReader csvReader = new CSVReader(new FileReader(child));
            List<String[]> list = csvReader.readAll();

            // Store into 2D array by transforming array list to normal array
            dataArr = new String[list.size()][];
            dataArr = list.toArray(dataArr);

            calculateVec(dataArr);

            csvReader.close();
          }
          // NOTE(review): only the header of the LAST file is stored here, and this
          // throws a NullPointerException when the directory is empty -- confirm
          // all input files are guaranteed to share one header.
          storeHead(dataArr); // Store the header
        }
      } else // Process only one file
      {
        File file = new File(sourceDir);

        if (file != null) { // NOTE(review): always true -- 'new' never yields null
          CSVReader csvReader = new CSVReader(new FileReader(file));
          List<String[]> list = csvReader.readAll();

          // Store into 2D array by transforming array list to normal array
          dataArr = new String[list.size()][];
          dataArr = list.toArray(dataArr);

          storeHead(dataArr); // Store the header
          calculateVec(dataArr);

          csvReader.close();
        }
      }
    } catch (FileNotFoundException e) {
      e.printStackTrace();
    } catch (IOException e) {
      e.printStackTrace();
    }
  }

  /**
   * Performs the pairwise difference calculation on each possible combination of
   * rows, appending a "1"/"-1" label derived from the two relevance judgements,
   * then merges the label-balanced result into the master list.
   *
   * @param arr the parsed contents of the original CSV file; row 0 is the header
   *            and the last column holds the textual relevance judgement
   */
  public static void calculateVec(String[][] arr) {
    List<List<String>> listofLists = new ArrayList<>(); // Holds calculations

    int rowStart = 1;
    for (int row = rowStart; row < arr.length; row++) // Start at row 1 because row 0 is the heading
    {
      for (int i = 1; i < arr.length - row; i++) {
        List<String> colList = new ArrayList<>(); // holds one pairwise difference row
        for (int col = 0; col < arr[0].length - 1; col++) // Columns go until the next to last column
        {
          // Extract double value from each cell
          double x1 = Double.parseDouble(arr[row][col]);
          double x2 = Double.parseDouble(arr[row + i][col]);

          // Perform calculation for each cell
          double result = x1 - x2;

          // Convert this double value into string, and store inside array list
          String strResult = Double.toString(result);
          colList.add(strResult);
        }

        // Finally, add either 1, -1, or do not add the row at all, based on the evaluations
        int addEvalNum = compareEvaluation(arr[row][arr[0].length - 1], arr[row + i][arr[0].length - 1]);
        if (addEvalNum == 1) {
          colList.add("1");
          listofLists.add(colList); // Add this list to 2D list - row is finished now, move on
        } else if (addEvalNum == -1) {
          colList.add("-1");
          listofLists.add(colList); // Add this list to 2D list - row is finished now, move on
        }
        // Else, they are equal, do not even add this row to 2D vector
      }
    }

    // After all processing takes place, rebalance so the data contains an
    // (almost) equal number of "1" and "-1" labels before accumulating it.
    List<List<String>> equalizedList = equalizeList(listofLists);
    myMasterList.addAll(equalizedList);
  }

  /**
   * Compares two textual relevance judgements via their ordinals in {@link #map1}.
   *
   * @param eval1 evaluation from the first vector
   * @param eval2 evaluation from the second vector
   * @return 1 if the first evaluation is greater than the second, -1 if it is
   *         less, and 10 when the two are equal (sentinel: the row is unusable)
   */
  public static int compareEvaluation(String eval1, String eval2) {
    int evalNum1 = map1.get(eval1);
    int evalNum2 = map1.get(eval2);

    if (evalNum1 > evalNum2) // ">" means it is more relevant - assign a 1
    {
      return 1;
    } else if (evalNum1 < evalNum2) {
      return -1;
    } else {
      return 10; // Return 10 if they are equal - signifies you cannot use the row
    }
  }

  /**
   * After vector calculations and new evaluation values are set, rebalances the
   * data in place so there is an equal (or nearly equal) number of rows labelled
   * "1" and "-1". A surplus row is converted to the other class by negating
   * every value in it (including the label, which is the last column).
   *
   * @param rawList originally calculated data from the input CSV file
   * @return the same list, with an equalized distribution of labels
   */
  public static List<List<String>> equalizeList(List<List<String>> rawList) {
    // Create two sets - one containing row indexes for +1 and the other for -1
    List<Integer> pos1List = new ArrayList<>();
    List<Integer> neg1List = new ArrayList<>();

    for (int i = 0; i < rawList.size(); i++) // Iterate through all rows to get indexes
    {
      // Label column index is taken from row 0's width; rows are assumed uniform.
      int evalNum = Integer.parseInt(rawList.get(i).get(rawList.get(0).size() - 1)); // Get 1 or -1 from original array list
      if (evalNum == 1) {
        pos1List.add(i); // Add row index that has 1
      } else if (evalNum == -1) {
        neg1List.add(i); // Add row index that has -1
      }
    }

    int totPosCount = pos1List.size(); // Total # of 1's
    int totNegCount = neg1List.size(); // Total # of -1's

    if ((totPosCount - totNegCount) >= 1) // There are more 1's than -1's, equalize them
    {
      int indexOfPosList = 0; // Start getting indexes from the first index of positive index location list
      while ((totPosCount - totNegCount) >= 1) // Keep going until we have acceptable amount of both +1 and -1
      {
        int pos1IndexVal = pos1List.get(indexOfPosList); // Get index from previously made list of indexes
        for (int col = 0; col < rawList.get(0).size(); col++) // Go through elements of indexed row, negating it to transform to -1 row
        {
          double d = Double.parseDouble(rawList.get(pos1IndexVal).get(col)); // Transform to double first
          d = d * -1; // Negate it
          String negatedValue = Double.toString(d); // Change back to String
          rawList.get(pos1IndexVal).set(col, negatedValue); // Put this value back into the row
        }

        totPosCount--; // We changed a +1 row to a -1 row, decrement count of positives
        totNegCount++; // Increment count of negatives
        indexOfPosList++; // Get next +1 location in raw data
      }
    } else if ((totNegCount - totPosCount) > 1) // There are more -1's than 1's, equalize them
    {
      int indexOfNegList = 0;
      while ((totNegCount - totPosCount) > 1) // Keep going until we have acceptable amount of both +1 and -1
      {
        int neg1IndexVal = neg1List.get(indexOfNegList); // Get index from previously made list of indexes
        for (int col = 0; col < rawList.get(0).size(); col++) // Go through elements of indexed row, negating it to transform to +1 row
        {
          double d = Double.parseDouble(rawList.get(neg1IndexVal).get(col)); // Transform to double first
          d = d * -1; // Negate it
          String negatedValue = Double.toString(d); // Change back to String
          rawList.get(neg1IndexVal).set(col, negatedValue); // Put this value back into the row
        }

        totNegCount--; // We changed a -1 row to a +1 row, decrement count of negatives now
        totPosCount++; // Increment count of positives
        indexOfNegList++; // Get next -1 location in raw data
      }
    } else {
      // Do nothing - rows are within acceptable equality bounds of plus or minus 1
    }

    return rawList;
  }

  /**
   * Retrieves the heading from a file to be processed so it can be written to
   * the output file later.
   *
   * @param arr 2D array containing the parsed information from the input file
   */
  public static void storeHead(String[][] arr) {
    myHeader = new String[arr[0].length]; // Allocate a fresh header array

    System.arraycopy(arr[0], 0, myHeader, 0, arr[0].length);
  }

  /**
   * Writes the newly calculated and equally distributed vector data to the
   * user-specified CSV file.
   *
   * NOTE(review): when the output file already exists, the FileWriter below
   * still truncates it but nothing is written, wiping any previous content --
   * confirm whether the intent was to append or to skip writing entirely.
   *
   * @param list finalized vector data to write to the user-specified output file
   */
  public static void writeCSVfile(List<List<String>> list) {
    String outputFile = myResultDir;
    boolean alreadyExists = new File(outputFile).exists();

    try {
      CSVWriter csvOutput = new CSVWriter(new FileWriter(outputFile), ','); // Create new instance of CSVWriter to write to file output

      if (!alreadyExists) {
        csvOutput.writeNext(myHeader); // Write the text headers first before data

        for (List<String> aList : list) { // Iterate through all rows in 2D array
          String[] temp = new String[aList.size()]; // Convert row array list in 2D array to regular string array
          temp = aList.toArray(temp);
          csvOutput.writeNext(temp); // Write this array to the file
        }
      }

      csvOutput.close(); // Close csvWriter
    } catch (IOException e) {
      e.printStackTrace();
    }
  }

}
/**
 * Converts ranking training data from CSV into the LibSVM text format consumed
 * by Spark MLlib: one instance per line, "label index1:value1 index2:value2 ...",
 * where label -1 becomes class 0 and label 1 becomes class 1.
 */
public class SparkFormatter {
  // Locale-independent formatter (always '.' as decimal separator, at most 3
  // fraction digits) so the output parses as LibSVM regardless of the JVM's
  // default locale.
  DecimalFormat NDForm = new DecimalFormat("#.###", DecimalFormatSymbols.getInstance(Locale.ROOT));

  public SparkFormatter() {
  }

  /**
   * Reads a CSV training file (header row, then one instance per row with the
   * last column holding the label "1"/"-1"; values may be double-quoted) and
   * writes the Spark LibSVM equivalent. Any existing output file is overwritten.
   *
   * @param inputCSVFileName  path to the input CSV file
   * @param outputTXTFileName path of the LibSVM-format output file
   */
  public void toSparkSVMformat(String inputCSVFileName, String outputTXTFileName) {
    try (BufferedReader br = new BufferedReader(new FileReader(inputCSVFileName));
         BufferedWriter bw = new BufferedWriter(new FileWriter(outputTXTFileName))) {
      br.readLine(); // skip the CSV header row
      String line;
      while ((line = br.readLine()) != null) {
        String[] cells = line.split(",");
        StringBuilder output = new StringBuilder();

        // Map the +-1 label in the last column onto LibSVM's 0/1 classes.
        double label = Double.parseDouble(cells[cells.length - 1].replace("\"", ""));
        if (label == -1.0) {
          output.append("0 ");
        } else if (label == 1.0) {
          output.append("1 ");
        }

        // Feature indexes in LibSVM format are 1-based.
        for (int i = 0; i < cells.length - 1; i++) {
          output.append(i + 1).append(':')
                .append(NDForm.format(Double.parseDouble(cells[i].replace("\"", "")))).append(' ');
        }
        bw.write(output + "\n");
      }
    } catch (IOException e) {
      e.printStackTrace();
    }
  }

  public static void main(String[] args) {
    SparkFormatter sf = new SparkFormatter();
    sf.toSparkSVMformat("C:/mudrodCoreTestData/rankingResults/inputDataForSVM.csv", "C:/mudrodCoreTestData/rankingResults/inputDataForSVM_spark.txt");
  }

}
/**
 * Command-line utility that trains a Spark MLlib SVM (SVMWithSGD) model from
 * LibSVM-formatted training data found on the classpath and saves it as the
 * "javaSVMWithSGDModel" resource.
 */
public class SparkSVM {

  private SparkSVM() {
    // Private constructor: this utility class is only used through main().
  }

  /**
   * Trains and saves the SVM ranking model.
   *
   * @param args unused
   */
  public static void main(String[] args) {
    MudrodEngine me = new MudrodEngine();

    JavaSparkContext jsc = me.startSparkDriver().sc;

    // Training data must be present as the classpath resource produced by SparkFormatter.
    String path = SparkSVM.class.getClassLoader().getResource("inputDataForSVM_spark.txt").toString();
    JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), path).toJavaRDD();

    // Run training algorithm to build the model.
    int numIterations = 100;
    final SVMModel model = SVMWithSGD.train(data.rdd(), numIterations);

    // Save the trained model next to the classpath resources so it can be
    // loaded later by the online ranking code.
    model.save(jsc.sc(), SparkSVM.class.getClassLoader().getResource("javaSVMWithSGDModel").toString());

    jsc.sc().stop();

  }

}
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.sdap.mudrod.ranking.offline.train; + +import org.apache.sdap.mudrod.discoveryengine.MudrodAbstract; +import org.apache.sdap.mudrod.driver.ESDriver; +import org.apache.sdap.mudrod.driver.SparkDriver; +import org.apache.sdap.mudrod.main.MudrodConstants; +import org.elasticsearch.action.index.IndexRequest; +import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.index.query.QueryBuilders; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.Properties; + +import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; + +/** + * Supports the ability to importing training set into Elasticsearch + */ +public class TrainingImporter extends MudrodAbstract { + /** + * + */ + private static final long serialVersionUID = 1L; + + public TrainingImporter(Properties props, ESDriver es, SparkDriver spark) { + super(props, es, spark); + es.deleteAllByQuery(props.getProperty(MudrodConstants.ES_INDEX_NAME), "trainingranking", QueryBuilders.matchAllQuery()); + addMapping(); + } + + /** + * Method of adding mapping to traning set type + */ + public void addMapping() { + XContentBuilder Mapping; + try { + Mapping = jsonBuilder().startObject().startObject("trainingranking").startObject("properties").startObject("query").field("type", "string").field("index", "not_analyzed").endObject() + .startObject("dataID").field("type", "string").field("index", "not_analyzed").endObject().startObject("label").field("type", "string").field("index", "not_analyzed").endObject().endObject() + .endObject().endObject(); + + es.getClient().admin().indices().preparePutMapping(props.getProperty(MudrodConstants.ES_INDEX_NAME)).setType("trainingranking").setSource(Mapping).execute().actionGet(); + } catch (IOException e) { + e.printStackTrace(); + } + } + + /** + 
* Method of importing training set in to Elasticsearch + * + * @param dataFolder the path to the traing set + * @throws IOException IOException + */ + public void importTrainingSet(String dataFolder) throws IOException { + es.createBulkProcessor(); + + File[] files = new File(dataFolder).listFiles(); + for (File file : files) { + BufferedReader br = new BufferedReader(new FileReader(file.getAbsolutePath())); + br.readLine(); + String line = br.readLine(); + while (line != null) { + String[] list = line.split(","); + String query = file.getName().replace(".csv", ""); + if (list.length > 0) { + IndexRequest ir = new IndexRequest(props.getProperty(MudrodConstants.ES_INDEX_NAME), "trainingranking") + .source(jsonBuilder().startObject().field("query", query).field("dataID", list[0]).field("label", list[list.length - 1]).endObject()); + es.getBulkProcessor().add(ir); + } + line = br.readLine(); + } + br.close(); + } + es.destroyBulkProcessor(); + } +} diff --git a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/offline/train/package-info.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/offline/train/package-info.java new file mode 100644 index 0000000..e067620 --- /dev/null +++ b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/offline/train/package-info.java @@ -0,0 +1,18 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/** + * This package includes classes for importing training data, ML models, + * generating input data for RankSVM, and evaluating ranking results + */ +package org.apache.sdap.mudrod.ranking.offline.train; diff --git a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/structure/SResult.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/structure/SResult.java new file mode 100644 index 0000000..b1cca48 --- /dev/null +++ b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/structure/SResult.java @@ -0,0 +1,182 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * Data structure for one search result, holding the per-feature ranking scores
 * used by the learning-to-rank pipeline.
 *
 * <p>Fields are read and written reflectively by name through
 * {@link #get(Object, String)} and {@link #set(Object, String, Object)}, so the
 * field names (including existing misspellings) must stay stable.
 */
public class SResult {
  /** Names of the score fields exported by {@link #getHeader(String)} and {@link #toString(String)}. */
  public static final String rlist[] = { "term_score", "releaseDate_score", /*"versionNum_score",*/
      "processingL_score", "allPop_score", "monthPop_score", "userPop_score"/*, "termAndv_score"*/ };
  String shortName = null;
  String longName = null;
  String topic = null;
  String description = null;
  // NOTE: misspelling of "release_date" kept deliberately -- the name is
  // referenced reflectively by callers and must not be changed.
  String relase_date = null;

  public Double final_score = 0.0;
  public Double term_score = 0.0;
  public Double releaseDate_score = 0.0;
  public Double versionNum_score = 0.0;
  public Double processingL_score = 0.0;
  public Double click_score = 0.0;
  public Double allPop_score = 0.0;
  public Double monthPop_score = 0.0;
  public Double userPop_score = 0.0;
  public Double termAndv_score = 0.0;
  public Integer below = 0;

  public Double Dataset_LongName_score = null;
  public Double Dataset_Metadata_score = null;
  public Double DatasetParameter_Term_score = null;
  public Double DatasetSource_Source_LongName_score = null;
  public Double DatasetSource_Sensor_LongName_score = null;

  public String version = null;
  public String processingLevel = null;
  public String latency = null;
  public String stopDateLong = null;
  public String stopDateFormat = null;
  public Double spatialR_Sat = null;
  public Double spatialR_Grid = null;
  public String temporalR = null;

  public Double releaseDate = null;
  public Double click = null;
  public Double term = null;
  public Double versionNum = null;
  public Double processingL = null;
  public Double allPop = null;
  public Double monthPop = null;
  public Double userPop = null;
  public Double termAndv = null;

  public Double Dataset_LongName = null;
  public Double Dataset_Metadata = null;
  public Double DatasetParameter_Term = null;
  public Double DatasetSource_Source_LongName = null;
  public Double DatasetSource_Sensor_LongName = null;

  public Double prediction = 0.0;
  public String label = null;

  public String startDate;
  public String endDate;
  public String sensors;

  /**
   * @param shortName   short name of dataset
   * @param longName    long name of dataset
   * @param topic       topic of dataset
   * @param description description of dataset
   * @param date        release date of dataset
   */
  public SResult(String shortName, String longName, String topic, String description, String date) {
    this.shortName = shortName;
    this.longName = longName;
    this.topic = topic;
    this.description = description;
    this.relase_date = date;
  }

  /**
   * Copy constructor: copies only the score fields listed in {@link #rlist}.
   *
   * @param sr the search result to copy scores from
   */
  public SResult(SResult sr) {
    for (String aRlist : rlist) {
      set(this, aRlist, get(sr, aRlist));
    }
  }

  /**
   * Builds the export header line.
   *
   * @param delimiter the delimiter used to separate columns
   * @return header line, terminated by a newline
   */
  public static String getHeader(String delimiter) {
    StringBuilder str = new StringBuilder("ShortName").append(delimiter).append("below").append(delimiter);
    for (String aRlist : rlist) {
      str.append(aRlist).append(delimiter);
    }
    return str.append("label").append("\n").toString();
  }

  /**
   * Renders this search result as one delimited export line matching
   * {@link #getHeader(String)}.
   *
   * @param delimiter the delimiter used to separate columns
   * @return search result as a string, terminated by a newline
   */
  public String toString(String delimiter) {
    // Empty builder first so a null shortName renders as "null" (like string
    // concatenation does) instead of throwing.
    StringBuilder str = new StringBuilder().append(shortName).append(delimiter).append(below).append(delimiter);
    for (String aRlist : rlist) {
      double score = get(this, aRlist);
      str.append(score).append(delimiter);
    }
    return str.append(label).append("\n").toString();
  }

  /**
   * Generic reflective setter; walks up the class hierarchy to find the field.
   *
   * @param object     instance of SResult
   * @param fieldName  name of the field to set
   * @param fieldValue value to assign to the field
   * @return true on success, false if no such field exists
   */
  public static boolean set(Object object, String fieldName, Object fieldValue) {
    Class<?> clazz = object.getClass();
    while (clazz != null) {
      try {
        Field field = clazz.getDeclaredField(fieldName);
        field.setAccessible(true);
        field.set(object, fieldValue);
        return true;
      } catch (NoSuchFieldException e) {
        // Not declared here: try the superclass.
        clazz = clazz.getSuperclass();
      } catch (Exception e) {
        throw new IllegalStateException(e);
      }
    }
    return false;
  }

  /**
   * Generic reflective getter; walks up the class hierarchy to find the field.
   *
   * @param object    instance of SResult
   * @param fieldName name of the field to read
   * @param <V>       expected type of the field value
   * @return the value of the field in the object, or null if no such field exists
   */
  @SuppressWarnings("unchecked")
  public static <V> V get(Object object, String fieldName) {
    Class<?> clazz = object.getClass();
    while (clazz != null) {
      try {
        Field field = clazz.getDeclaredField(fieldName);
        field.setAccessible(true);
        return (V) field.get(object);
      } catch (NoSuchFieldException e) {
        // Not declared here: try the superclass.
        clazz = clazz.getSuperclass();
      } catch (Exception e) {
        throw new IllegalStateException(e);
      }
    }
    return null;
  }

}
+ */ +/** + * This package includes data structure needed for ranking process + */ +package org.apache.sdap.mudrod.ranking.structure; diff --git a/ranking/src/main/resources/config.properties b/ranking/src/main/resources/config.properties new file mode 100644 index 0000000..4c8991e --- /dev/null +++ b/ranking/src/main/resources/config.properties @@ -0,0 +1,74 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); you +# may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Database configuration +mudrod.cluster.name=MudrodES +mudrod.es.transport.tcp.port = 9300 +mudrod.es.unicast.hosts = 127.0.0.1 +mudrod.es.http.port = 9200 +mudrod.es.index = mudrod + +# Spark related +# Log processing type. Possible values include 'sequential' or 'parallel' +mudrod.processing.type = parallel +mudrod.spark.app.name = MudrodSparkApp +mudrod.spark.master = local[4] +mudrod.spark.optimize = repartition + +# Web log processing configuration +# index name has to be all lowercase +mudrod.log.index = log +mudrod.ftp.prefix = FTP. +mudrod.http.prefix = WWW. 
+mudrod.base.url = http://podaac.jpl.nasa.gov +mudrod.black.request.list = .js, .css, .jpg, .png, .ico, image_captcha, autocomplete, .gif, /alldata/, /api/, get / http/1.1, .jpeg, /ws/ +mudrod.black.agent.list = crawler, googlebot, bingbot, slurp, yacybot, rogerbot, yandexbot, -, apache-httpclient, java, curl +mudrod.search.freq = 100 +mudrod.view.freq = 200 +mudrod.download.freq = 100 +mudrod.request.rate = 30 +mudrod.session.port = 8080 +mudrod.session.url = /mudrod-service/session.html +mudrod.request.time.gap = 600 +mudrod.view.url.marker = /dataset/ +mudrod.search.url.marker = /datasetlist? +# In order to better parse a URL (getting searching keyword, etc.), please consider custimize +# org.apache.sdap.mudrod.weblog.structure.RequestUrl - GetSearchInfo, getFilterInfo + +# User search history +mudrod.query.min = 0 +mudrod.user.history.weight = 2 + +# clickstream +mudrod.download.weight = 3 +mudrod.clickstream.svd.d = 50 +mudrod.clickstream.weight = 2 + +# metadata +mudrod.metadata.download = 0 +mudrod.metadata.download.url = https://podaac.jpl.nasa.gov/api/dataset?startIndex=$startIndex&entries=10&sortField=Dataset-AllTimePopularity&sortOrder=asc&id=&value=&search= +mudrod.metadata.svd.d = 50 +mudrod.metadata.url = null +mudrod.metadata.weight = 1 +mudrod.metadata.type = RawMetadata + +# ranking, ${svmSgdModel.value} is resolved at build time. See the property in core/pom.xml for the value +mudrod.ranking.machine.learning = 1 +mudrod.ranking.model = ${svmSgdModel.value}.zip + +# recommendation +mudrod.metadata.id = Dataset-ShortName +mudrod.metadata.semantic.fields = DatasetParameter-Term,DatasetParameter-Variable,Dataset-ExtractTerm + +# ontology service implementation. 
Possible values include EsipPortal - EsipPortalOntology EsipCOR - EsipCOROntology Local - org.apache.sdap.mudrod.ontology.process.Local +mudrod.ontology.implementation = Local +mudrod.ontology.weight = 2 diff --git a/ranking/src/main/resources/elastic_mappings.json b/ranking/src/main/resources/elastic_mappings.json new file mode 100644 index 0000000..6a0494f --- /dev/null +++ b/ranking/src/main/resources/elastic_mappings.json @@ -0,0 +1,68 @@ +{ + "_default_": { + "properties": { + "keywords": { + "type": "text", + "analyzer": "csv", + "fielddata": true + }, + "views": { + "type": "string", + "analyzer": "csv" + }, + "downloads": { + "type": "string", + "analyzer": "csv" + }, + "RequestUrl": { + "type": "string", + "include_in_all": false, + "index": "no" + }, + "IP": { + "type": "keyword", + "index": "not_analyzed" + }, + "Browser": { + "type": "string", + "include_in_all": false, + "index": "no" + }, + "SessionURL": { + "type": "string", + "include_in_all": false, + "index": "no" + }, + "Referer": { + "type": "string", + "index": "not_analyzed" + }, + "SessionID": { + "type": "string", + "index": "not_analyzed" + }, + "Response": { + "type": "string", + "include_in_all": false, + "index": "no" + }, + "Request": { + "type": "string", + "include_in_all": false, + "index": "no" + }, + "Coordinates": { + "type": "geo_point", + "include_in_all": false, + "index": "no" + }, + "LogType": { + "type": "string", + "index": "not_analyzed" + }, + "Dataset-Metadata": { + "type": "completion" + } + } + } +} \ No newline at end of file diff --git a/ranking/src/main/resources/elastic_settings.json b/ranking/src/main/resources/elastic_settings.json new file mode 100644 index 0000000..05f8664 --- /dev/null +++ b/ranking/src/main/resources/elastic_settings.json @@ -0,0 +1,36 @@ +{ + "index": { + "number_of_replicas": 0, + "refresh_interval": "-1", + "number_of_shards": "5", + "translog.flush_threshold_size": "1g", + "translog.sync_interval": "30s", + "warmer.enabled": "false" + }, 
+ "analysis": { + "filter": { + "cody_stop": { + "type": "stop", + "stopwords": "_english_" + }, + "cody_stemmer": { + "type": "stemmer", + "language": "light_english" + } + }, + "analyzer": { + "cody": { + "tokenizer": "standard", + "filter": [ + "lowercase", + "cody_stop", + "cody_stemmer" + ] + }, + "csv": { + "type": "pattern", + "pattern": "," + } + } + } +} \ No newline at end of file diff --git a/ranking/src/main/resources/javaSVMWithSGDModel/data/_SUCCESS b/ranking/src/main/resources/javaSVMWithSGDModel/data/_SUCCESS new file mode 100644 index 0000000..e69de29 diff --git a/ranking/src/main/resources/javaSVMWithSGDModel/data/_common_metadata b/ranking/src/main/resources/javaSVMWithSGDModel/data/_common_metadata new file mode 100644 index 0000000000000000000000000000000000000000..cafbf1b4cd44b0290ced70a047b59064c2190f9d GIT binary patch literal 1022 zcmb7@T}vB56o!Xws#Uz*vVl@$Ku}1R+U&-VUKPFYN~GA{h?J9=lVs@ZhcmO4ZOA|D zo&Ti2pflNZ8c~B@Ewgjp^FHTe&cA#;41?~U(;Z!c`uRh%8D*H2a)(zp4p%(pO3FuANfG+XwDCxL1XwC9UN*GI9$ONRu9=)rcj^>L+ z&sXVY=1Zim%TrYM^tMA!%iBr&44xO1g4+IWr`=SulAjg10g>x7%h@t6R;kyzHf!@- zQ(jcA5q4xR_q?*bTIuClD!-y0jC$+;<5-)*Z}m%4TrEiSxn1@TncCeSS3>gL{wJ0U n21$RIqzFSe`k1nQ$`fS62^*vdv){?^aKI19-&cg3+n4_jb&^Qd literal 0 HcmV?d00001 diff --git a/ranking/src/main/resources/javaSVMWithSGDModel/data/_metadata b/ranking/src/main/resources/javaSVMWithSGDModel/data/_metadata new file mode 100644 index 0000000000000000000000000000000000000000..6bfec98c70dc1104bbcb3a5b730a683091c8921a GIT binary patch literal 1757 zcmb`I%WB&|6o$t_vE7iZgaoF9ftUm$;ftgwsiBl+kyQyL&8CEQERAiEvFvC@4NfqH z?AmAOv!q)uN}+VwRh@U}s*BFZvf{L{X-EXV$U5hI|GAC#pS`RqlKy3Hh4xW&{DyiR z+(k+r*pgN_#X-Bns8R%qv<5Vp4f=TPvAB^mnFeP#u?j*z@Gw;*;U#M_2`E#fiAw5BC|E+_O%EX{Segph|j4Ct8DxWkzIF#AV=0 z#YMPMH=L$ZHLUG=)o7yHwo%)vS1cdf$aXEWeP(i}lNd9#SmG($`6AHQr#U9MDY7c5 zhw53m*wEx006b;!uJRYSC-Ne93saOZVY=$u@XA|4Z zC-#MdM-b)q@E>fKkf3RjAVj2X9%7I6qF1{I+@n8up8E6$6NPh4x%7iXUr6EcXH_qxPEN)Dh5Ef%6oR+x?#v@ztC?CZRF$2;nJ}24 
zuiLROL^k|)4RJIC+LQFGHoDR3dNU4)zgn+(mg87;*RnBk(N@Lw>b7rT&uw{*ZF&5& R Date: Thu, 28 Jun 2018 13:33:46 -0700 Subject: [PATCH 02/13] =?UTF-8?q?refactor=20ranking=20module=EF=BC=9Aadd?= =?UTF-8?q?=20ranksvm=20model?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../sdap/mudrod/ssearch/Dispatcher.java | 128 -------- .../apache/sdap/mudrod/ssearch/Ranker.java | 170 ---------- .../apache/sdap/mudrod/ssearch/Searcher.java | 265 --------------- .../sdap/mudrod/ssearch/package-info.java | 18 - .../mudrod/ssearch/ranking/Evaluator.java | 142 -------- .../mudrod/ssearch/structure/SResult.java | 182 ---------- .../ssearch/structure/package-info.java | 17 - .../weblog/process/ClickStreamAnalyzer.java | 5 +- .../javaSVMWithSGDModel/data/_SUCCESS | 0 .../javaSVMWithSGDModel/data/_common_metadata | Bin 1022 -> 0 bytes .../javaSVMWithSGDModel/data/_metadata | Bin 1757 -> 0 bytes .../javaSVMWithSGDModel/metadata/_SUCCESS | 0 .../javaSVMWithSGDModel/metadata/part-00000 | 1 - pom.xml | 1 + .../sdap/mudrod/ranking/common/Learner.java | 84 +++++ .../mudrod/ranking/common/LearnerFactory.java | 43 ++- .../sdap/mudrod/ranking/common/Ranker.java | 14 +- .../sdap/mudrod/ranking/common/Searcher.java | 2 +- .../ranking/dlrank}/SparkFormatter.java | 2 +- .../sdap/mudrod/ranking/dlrank}/SparkSVM.java | 2 +- .../offline/predict/OfflineLearner.java | 57 ---- .../offline/train/ClickstreamImporter.java | 113 ------- .../offline/train/OfflineDataGenerator.java | 310 ------------------ .../offline/train/TrainingImporter.java | 91 ----- .../ranking/offline/train/package-info.java | 18 - .../mudrod/ranking/ranksvm/SVMLearner.java | 120 +++++++ .../train => ranksvm}/SparkFormatter.java | 4 +- .../{offline/train => ranksvm}/SparkSVM.java | 4 +- .../traindata}/ClickstreamImporter.java | 2 +- .../mudrod/ranking/traindata/ExpertData.java | 30 +- .../ranking/traindata}/TrainingImporter.java | 2 +- .../ranking/traindata}/package-info.java | 2 +- 
ranking/src/main/resources/config.properties | 74 ----- .../src/main/resources/elastic_mappings.json | 68 ---- .../src/main/resources/elastic_settings.json | 36 -- ...e03-6b61-4931-ba29-27304de5a584.gz.parquet | Bin ranking/src/main/resources/log4j.properties | 63 ---- ranking/src/main/resources/log4j2.properties | 63 ---- service/pom.xml | 7 + .../services/MudrodContextListener.java | 4 +- .../search/SearchMetadataResource.java | 4 +- 41 files changed, 268 insertions(+), 1880 deletions(-) delete mode 100644 core/src/main/java/org/apache/sdap/mudrod/ssearch/Dispatcher.java delete mode 100644 core/src/main/java/org/apache/sdap/mudrod/ssearch/Ranker.java delete mode 100644 core/src/main/java/org/apache/sdap/mudrod/ssearch/Searcher.java delete mode 100644 core/src/main/java/org/apache/sdap/mudrod/ssearch/package-info.java delete mode 100644 core/src/main/java/org/apache/sdap/mudrod/ssearch/ranking/Evaluator.java delete mode 100644 core/src/main/java/org/apache/sdap/mudrod/ssearch/structure/SResult.java delete mode 100644 core/src/main/java/org/apache/sdap/mudrod/ssearch/structure/package-info.java delete mode 100644 core/src/main/resources/javaSVMWithSGDModel/data/_SUCCESS delete mode 100644 core/src/main/resources/javaSVMWithSGDModel/data/_common_metadata delete mode 100644 core/src/main/resources/javaSVMWithSGDModel/data/_metadata delete mode 100644 core/src/main/resources/javaSVMWithSGDModel/metadata/_SUCCESS delete mode 100644 core/src/main/resources/javaSVMWithSGDModel/metadata/part-00000 create mode 100644 ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/Learner.java rename core/src/main/java/org/apache/sdap/mudrod/ssearch/ranking/Learner.java => ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/LearnerFactory.java (57%) rename {core/src/main/java/org/apache/sdap/mudrod/ssearch/ranking => ranking/src/main/java/org/apache/sdap/mudrod/ranking/dlrank}/SparkFormatter.java (97%) rename {core/src/main/java/org/apache/sdap/mudrod/ssearch/ranking => 
ranking/src/main/java/org/apache/sdap/mudrod/ranking/dlrank}/SparkSVM.java (97%) delete mode 100644 ranking/src/main/java/org/apache/sdap/mudrod/ranking/offline/predict/OfflineLearner.java delete mode 100644 ranking/src/main/java/org/apache/sdap/mudrod/ranking/offline/train/ClickstreamImporter.java delete mode 100644 ranking/src/main/java/org/apache/sdap/mudrod/ranking/offline/train/OfflineDataGenerator.java delete mode 100644 ranking/src/main/java/org/apache/sdap/mudrod/ranking/offline/train/TrainingImporter.java delete mode 100644 ranking/src/main/java/org/apache/sdap/mudrod/ranking/offline/train/package-info.java create mode 100644 ranking/src/main/java/org/apache/sdap/mudrod/ranking/ranksvm/SVMLearner.java rename ranking/src/main/java/org/apache/sdap/mudrod/ranking/{offline/train => ranksvm}/SparkFormatter.java (96%) rename ranking/src/main/java/org/apache/sdap/mudrod/ranking/{offline/train => ranksvm}/SparkSVM.java (96%) rename {core/src/main/java/org/apache/sdap/mudrod/ssearch => ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata}/ClickstreamImporter.java (98%) rename core/src/main/java/org/apache/sdap/mudrod/ssearch/ranking/DataGenerator.java => ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/ExpertData.java (93%) rename {core/src/main/java/org/apache/sdap/mudrod/ssearch/ranking => ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata}/TrainingImporter.java (98%) rename {core/src/main/java/org/apache/sdap/mudrod/ssearch/ranking => ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata}/package-info.java (93%) delete mode 100644 ranking/src/main/resources/config.properties delete mode 100644 ranking/src/main/resources/elastic_mappings.json delete mode 100644 ranking/src/main/resources/elastic_settings.json rename {core => ranking}/src/main/resources/javaSVMWithSGDModel/data/part-r-00000-e008ae03-6b61-4931-ba29-27304de5a584.gz.parquet (100%) delete mode 100644 ranking/src/main/resources/log4j.properties delete mode 
100644 ranking/src/main/resources/log4j2.properties diff --git a/core/src/main/java/org/apache/sdap/mudrod/ssearch/Dispatcher.java b/core/src/main/java/org/apache/sdap/mudrod/ssearch/Dispatcher.java deleted file mode 100644 index 611c76b..0000000 --- a/core/src/main/java/org/apache/sdap/mudrod/ssearch/Dispatcher.java +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.sdap.mudrod.ssearch; - -import org.apache.sdap.mudrod.discoveryengine.MudrodAbstract; -import org.apache.sdap.mudrod.driver.ESDriver; -import org.apache.sdap.mudrod.driver.SparkDriver; -import org.apache.sdap.mudrod.integration.LinkageIntegration; -import org.elasticsearch.index.query.BoolQueryBuilder; -import org.elasticsearch.index.query.MatchQueryBuilder; -import org.elasticsearch.index.query.MultiMatchQueryBuilder; -import org.elasticsearch.index.query.QueryBuilders; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.HashMap; -import java.util.Map; -import java.util.Map.Entry; -import java.util.Properties; - -/** - * Supports ability to transform regular user query into a semantic query - */ -public class Dispatcher extends MudrodAbstract { - private static final Logger LOG = LoggerFactory.getLogger(Dispatcher.class); - - public Dispatcher(Properties props, ESDriver es, SparkDriver spark) { - super(props, es, spark); - } - - /** - * Method of getting semantically most related terms by number - * - * @param 
input regular input query - * @param num the number of most related terms - * @return a map from term to similarity - */ - public Map getRelatedTerms(String input, int num) { - LinkageIntegration li = new LinkageIntegration(props, this.es, null); - Map sortedMap = li.appyMajorRule(input); - Map selected_Map = new HashMap<>(); - int count = 0; - for (Entry entry : sortedMap.entrySet()) { - if (count < num) { - selected_Map.put(entry.getKey(), entry.getValue()); - } - count++; - } - return selected_Map; - } - - /** - * Method of getting semantically most related terms by similarity threshold - * - * @param input regular input query - * @param T value of threshold, raning from 0 to 1 - * @return a map from term to similarity - */ - public Map getRelatedTermsByT(String input, double T) { - LinkageIntegration li = new LinkageIntegration(this.props, this.es, null); - Map sortedMap = li.appyMajorRule(input); - Map selected_Map = new HashMap<>(); - - for (Entry entry : sortedMap.entrySet()) { - if (entry.getValue() >= T) { - selected_Map.put(entry.getKey(), entry.getValue()); - } - } - return selected_Map; - } - - /** - * Method of creating semantic query based on Threshold - * - * @param input regular query - * @param T threshold raning from 0 to 1 - * @param query_operator query mode - * @return a multiMatch query builder - */ - public BoolQueryBuilder createSemQuery(String input, double T, String query_operator) { - Map selected_Map = getRelatedTermsByT(input, T); - selected_Map.put(input, (double) 1); - - String fieldsList[] = { "Dataset-Metadata", "Dataset-ShortName", "Dataset-LongName", - "DatasetParameter-Topic", "DatasetParameter-VariableDetail", "DatasetParameter-Category", - "DatasetParameter-Variable", "DatasetParameter-Term", - "DatasetSource-Source-LongName", "DatasetSource-Source-LongName-Full", - "DatasetSource-Source-ShortName", "DatasetSource-Source-ShortName-Full", - "DatasetSource-Sensor-LongName", "DatasetSource-Sensor-LongName-Full", 
"DatasetSource-Sensor-ShortName", - "DatasetSource-Sensor-ShortName-Full" }; - BoolQueryBuilder qb = new BoolQueryBuilder(); - for (Entry entry : selected_Map.entrySet()) { - if (query_operator.toLowerCase().trim().equals("phrase")) { - qb.should(QueryBuilders.multiMatchQuery(entry.getKey(), fieldsList).boost(entry.getValue().floatValue()).type(MultiMatchQueryBuilder.Type.PHRASE).tieBreaker((float) 0.5)); // when - // set - // to - // 1.0, - // it - // would - // be - // equal - // to - // "most - // fields" - // query - } else if (query_operator.toLowerCase().trim().equals("and")) { - qb.should(QueryBuilders.multiMatchQuery(entry.getKey(), fieldsList).boost(entry.getValue().floatValue()).operator(MatchQueryBuilder.DEFAULT_OPERATOR.AND).tieBreaker((float) 0.5)); - } else { - qb.should(QueryBuilders.multiMatchQuery(entry.getKey(), fieldsList).boost(entry.getValue().floatValue()).operator(MatchQueryBuilder.DEFAULT_OPERATOR.OR).tieBreaker((float) 0.5)); - } - } - - // LOG.info(qb.toString()); - return qb; - } - -} diff --git a/core/src/main/java/org/apache/sdap/mudrod/ssearch/Ranker.java b/core/src/main/java/org/apache/sdap/mudrod/ssearch/Ranker.java deleted file mode 100644 index af7e6a9..0000000 --- a/core/src/main/java/org/apache/sdap/mudrod/ssearch/Ranker.java +++ /dev/null @@ -1,170 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.sdap.mudrod.ssearch; - -import org.apache.sdap.mudrod.discoveryengine.MudrodAbstract; -import org.apache.sdap.mudrod.driver.ESDriver; -import org.apache.sdap.mudrod.driver.SparkDriver; -import org.apache.sdap.mudrod.main.MudrodConstants; -import org.apache.sdap.mudrod.ssearch.ranking.Learner; -import org.apache.sdap.mudrod.ssearch.structure.SResult; -import org.apache.spark.mllib.linalg.Vectors; -import org.apache.spark.mllib.regression.LabeledPoint; - -import java.io.Serializable; -import java.text.DecimalFormat; -import java.util.*; - -/** - * Supports the ability to calculating ranking score - */ -public class Ranker extends MudrodAbstract implements Serializable { - private static final long serialVersionUID = 1L; - transient List resultList = new ArrayList<>(); - Learner le = null; - - public Ranker(Properties props, ESDriver es, SparkDriver spark) { - super(props, es, spark); - if("1".equals(props.getProperty(MudrodConstants.RANKING_ML))) - le = new Learner(spark, props.getProperty(MudrodConstants.RANKING_MODEL)); - } - - /** - * Method of calculating mean value - * - * @param attribute the attribute name that need to be calculated on - * @param resultList an array list of result - * @return mean value - */ - private double getMean(String attribute, List resultList) { - double sum = 0.0; - for (SResult a : resultList) { - sum += (double) SResult.get(a, attribute); - } - return getNDForm(sum / resultList.size()); - } - - /** - * Method of calculating variance value - * - * @param attribute the attribute name that need to be calculated on - * @param resultList an array list of result - * @return variance value - */ - private double getVariance(String attribute, List resultList) { - double mean = getMean(attribute, resultList); - double temp = 0.0; - double val; - for (SResult a : resultList) { - val = (Double) SResult.get(a, attribute); - temp += (mean - val) * (mean - val); - } - - return getNDForm(temp / resultList.size()); - } - - 
/** - * Method of calculating standard variance - * - * @param attribute the attribute name that need to be calculated on - * @param resultList an array list of result - * @return standard variance - */ - private double getStdDev(String attribute, List resultList) { - return getNDForm(Math.sqrt(getVariance(attribute, resultList))); - } - - /** - * Method of calculating Z score - * - * @param val the value of an attribute - * @param mean the mean value of an attribute - * @param std the standard deviation of an attribute - * @return Z score - */ - private double getZscore(double val, double mean, double std) { - if (!equalComp(std, 0)) { - return getNDForm((val - mean) / std); - } else { - return 0; - } - } - - private boolean equalComp(double a, double b) { - return Math.abs(a - b) < 0.0001; - } - - /** - * Get the first N decimals of a double value - * - * @param d double value that needs to be processed - * @return processed double value - */ - private double getNDForm(double d) { - DecimalFormat ndForm = new DecimalFormat("#.###"); - return Double.valueOf(ndForm.format(d)); - } - - /** - * Method of ranking a list of result - * - * @param resultList result list - * @return ranked result list - */ - public List rank(List resultList) { - if(le==null) return resultList; - - for (int i = 0; i < resultList.size(); i++) { - for (int m = 0; m < SResult.rlist.length; m++) { - String att = SResult.rlist[m].split("_")[0]; - double val = SResult.get(resultList.get(i), att); - double mean = getMean(att, resultList); - double std = getStdDev(att, resultList); - double score = getZscore(val, mean, std); - String scoreId = SResult.rlist[m]; - SResult.set(resultList.get(i), scoreId, score); - } - } - - Collections.sort(resultList, new ResultComparator()); - return resultList; - } - - /** - * Method of comparing results based on final score - */ - public class ResultComparator implements Comparator { - @Override - /** - * @param o1 one item from the search result list - * @param 
o2 another item from the search result list - * @return 1 meaning o1>o2, 0 meaning o1=o2 - */ - public int compare(SResult o1, SResult o2) { - List instList = new ArrayList<>(); - for (String str: SResult.rlist) { - double o2Score = SResult.get(o2, str); - double o1Score = SResult.get(o1, str); - instList.add(o2Score - o1Score); - } - - double[] ins = instList.stream().mapToDouble(i -> i).toArray(); - LabeledPoint insPoint = new LabeledPoint(99.0, Vectors.dense(ins)); - int prediction = (int)le.classify(insPoint); - - return prediction; - } - } - -} diff --git a/core/src/main/java/org/apache/sdap/mudrod/ssearch/Searcher.java b/core/src/main/java/org/apache/sdap/mudrod/ssearch/Searcher.java deleted file mode 100644 index a4fe686..0000000 --- a/core/src/main/java/org/apache/sdap/mudrod/ssearch/Searcher.java +++ /dev/null @@ -1,265 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.sdap.mudrod.ssearch; - -import com.google.gson.Gson; -import com.google.gson.JsonElement; -import com.google.gson.JsonObject; - -import org.apache.sdap.mudrod.discoveryengine.MudrodAbstract; -import org.apache.sdap.mudrod.driver.ESDriver; -import org.apache.sdap.mudrod.driver.SparkDriver; -import org.apache.sdap.mudrod.ssearch.structure.SResult; -import org.elasticsearch.action.search.SearchRequestBuilder; -import org.elasticsearch.action.search.SearchResponse; -import org.elasticsearch.index.query.BoolQueryBuilder; -import org.elasticsearch.index.query.QueryBuilder; -import org.elasticsearch.index.query.QueryBuilders; -import org.elasticsearch.search.SearchHit; -import org.elasticsearch.search.sort.SortOrder; - -import java.io.Serializable; -import java.text.DecimalFormat; -import java.text.SimpleDateFormat; -import java.util.*; -import java.util.regex.Pattern; - -/** - * Supports ability to performance semantic search with a given query - */ -public class Searcher extends MudrodAbstract implements Serializable { - /** - * - */ - private static final long serialVersionUID = 1L; - DecimalFormat NDForm = new DecimalFormat("#.##"); - final Integer MAX_CHAR = 700; - - public Searcher(Properties props, ESDriver es, SparkDriver spark) { - super(props, es, spark); - } - - /** - * Method of converting processing level string into a number - * - * @param pro processing level string - * @return processing level number - */ - public Double getProLevelNum(String pro) { - if (pro == null) { - return 1.0; - } - Double proNum; - Pattern p = Pattern.compile(".*[a-zA-Z].*"); - if (pro.matches("[0-9]{1}[a-zA-Z]{1}")) { - proNum = Double.parseDouble(pro.substring(0, 1)); - } else if (p.matcher(pro).find()) { - proNum = 1.0; - } else { - proNum = Double.parseDouble(pro); - } - - return proNum; - } - - public Double getPop(Double pop) { - if (pop > 1000) { - pop = 1000.0; - } - return pop; - } - - /** - * Main method of semantic search - * - * @param index 
index name in Elasticsearch - * @param type type name in Elasticsearch - * @param query regular query string - * @param queryOperator query mode- query, or, and - * @param rankOption a keyword used to dertermine the ElasticSearch SortOrder - * @return a list of search result - */ - @SuppressWarnings("unchecked") - public List searchByQuery(String index, String type, String query, String queryOperator, String rankOption) { - boolean exists = es.getClient().admin().indices().prepareExists(index).execute().actionGet().isExists(); - if (!exists) { - return new ArrayList<>(); - } - - SortOrder order = null; - String sortFiled = ""; - switch (rankOption) { - case "Rank-AllTimePopularity": - sortFiled = "Dataset-AllTimePopularity"; - order = SortOrder.DESC; - break; - case "Rank-MonthlyPopularity": - sortFiled = "Dataset-MonthlyPopularity"; - order = SortOrder.DESC; - break; - case "Rank-UserPopularity": - sortFiled = "Dataset-UserPopularity"; - order = SortOrder.DESC; - break; - case "Rank-LongName-Full": - sortFiled = "Dataset-LongName.raw"; - order = SortOrder.ASC; - break; - case "Rank-ShortName-Full": - sortFiled = "Dataset-ShortName.raw"; - order = SortOrder.ASC; - break; - case "Rank-GridSpatialResolution": - sortFiled = "Dataset-GridSpatialResolution"; - order = SortOrder.DESC; - break; - case "Rank-SatelliteSpatialResolution": - sortFiled = "Dataset-SatelliteSpatialResolution"; - order = SortOrder.DESC; - break; - case "Rank-StartTimeLong-Long": - sortFiled = "DatasetCoverage-StartTimeLong-Long"; - order = SortOrder.ASC; - break; - case "Rank-StopTimeLong-Long": - sortFiled = "DatasetCoverage-StopTimeLong-Long"; - order = SortOrder.DESC; - break; - default: - sortFiled = "Dataset-ShortName.raw"; - order = SortOrder.ASC; - break; - } - - Dispatcher dp = new Dispatcher(this.getConfig(), this.getES(), null); - BoolQueryBuilder qb = dp.createSemQuery(query, 1.0, queryOperator); - List resultList = new ArrayList<>(); - - SearchRequestBuilder builder = 
es.getClient().prepareSearch(index).setTypes(type).setQuery(qb).addSort(sortFiled, order).setSize(500).setTrackScores(true); - SearchResponse response = builder.execute().actionGet(); - - for (SearchHit hit : response.getHits().getHits()) { - Map result = hit.getSource(); - Double relevance = Double.valueOf(NDForm.format(hit.getScore())); - String shortName = (String) result.get("Dataset-ShortName"); - String longName = (String) result.get("Dataset-LongName"); - - ArrayList topicList = (ArrayList) result.get("DatasetParameter-Variable"); - String topic = ""; - if (null != topicList) { - topic = String.join(", ", topicList); - } - String content = (String) result.get("Dataset-Description"); - - if (!"".equals(content)) { - int maxLength = (content.length() < MAX_CHAR) ? content.length() : MAX_CHAR; - content = content.trim().substring(0, maxLength - 1) + "..."; - } - - ArrayList longdate = (ArrayList) result.get("DatasetCitation-ReleaseDateLong"); - Date date = new Date(Long.valueOf(longdate.get(0))); - SimpleDateFormat df2 = new SimpleDateFormat("MM/dd/yyyy"); - String dateText = df2.format(date); - - // start date - Long start = (Long) result.get("DatasetCoverage-StartTimeLong-Long"); - Date startDate = new Date(start); - String startDateTxt = df2.format(startDate); - - // end date - String end = (String) result.get("Dataset-DatasetCoverage-StopTimeLong"); - String endDateTxt = ""; - if ("".equals(end)) { - endDateTxt = "Present"; - } else { - Date endDate = new Date(Long.valueOf(end)); - endDateTxt = df2.format(endDate); - } - - String processingLevel = (String) result.get("Dataset-ProcessingLevel"); - Double proNum = getProLevelNum(processingLevel); - - Double userPop = getPop(((Integer) result.get("Dataset-UserPopularity")).doubleValue()); - Double allPop = getPop(((Integer) result.get("Dataset-AllTimePopularity")).doubleValue()); - Double monthPop = getPop(((Integer) result.get("Dataset-MonthlyPopularity")).doubleValue()); - - List sensors = (List) 
result.get("DatasetSource-Sensor-ShortName"); - - SResult re = new SResult(shortName, longName, topic, content, dateText); - - SResult.set(re, "term", relevance); - SResult.set(re, "releaseDate", Long.valueOf(longdate.get(0)).doubleValue()); - SResult.set(re, "processingLevel", processingLevel); - SResult.set(re, "processingL", proNum); - SResult.set(re, "userPop", userPop); - SResult.set(re, "allPop", allPop); - SResult.set(re, "monthPop", monthPop); - SResult.set(re, "startDate", startDateTxt); - SResult.set(re, "endDate", endDateTxt); - SResult.set(re, "sensors", String.join(", ", sensors)); - - QueryBuilder queryLabelSearch = QueryBuilders.boolQuery().must(QueryBuilders.termQuery("query", query)).must(QueryBuilders.termQuery("dataID", shortName)); - SearchResponse labelRes = es.getClient().prepareSearch(index).setTypes("trainingranking").setQuery(queryLabelSearch).setSize(5).execute().actionGet(); - String labelString = null; - for (SearchHit label : labelRes.getHits().getHits()) { - Map labelItem = label.getSource(); - labelString = (String) labelItem.get("label"); - } - SResult.set(re, "label", labelString); - resultList.add(re); - } - - return resultList; - } - - /** - * Method of semantic search to generate JSON string - * - * @param index index name in Elasticsearch - * @param type type name in Elasticsearch - * @param query regular query string - * @param queryOperator query mode- query, or, and - * @param rankOption a keyword used to dertermine the ElasticSearch SortOrder - * @param rr selected ranking method - * @return search results - */ - public String ssearch(String index, String type, String query, String queryOperator, String rankOption, Ranker rr) { - List li = searchByQuery(index, type, query, queryOperator, rankOption); - if ("Rank-SVM".equals(rankOption)) { - li = rr.rank(li); - } - Gson gson = new Gson(); - List fileList = new ArrayList<>(); - - for (SResult aLi : li) { - JsonObject file = new JsonObject(); - file.addProperty("Short Name", 
(String) SResult.get(aLi, "shortName")); - file.addProperty("Long Name", (String) SResult.get(aLi, "longName")); - file.addProperty("Topic", (String) SResult.get(aLi, "topic")); - file.addProperty("Description", (String) SResult.get(aLi, "description")); - file.addProperty("Release Date", (String) SResult.get(aLi, "relase_date")); - fileList.add(file); - - file.addProperty("Start/End Date", (String) SResult.get(aLi, "startDate") + " - " + (String) SResult.get(aLi, "endDate")); - file.addProperty("Processing Level", (String) SResult.get(aLi, "processingLevel")); - - file.addProperty("Sensor", (String) SResult.get(aLi, "sensors")); - } - JsonElement fileListElement = gson.toJsonTree(fileList); - - JsonObject pDResults = new JsonObject(); - pDResults.add("PDResults", fileListElement); - return pDResults.toString(); - } -} diff --git a/core/src/main/java/org/apache/sdap/mudrod/ssearch/package-info.java b/core/src/main/java/org/apache/sdap/mudrod/ssearch/package-info.java deleted file mode 100644 index b635b64..0000000 --- a/core/src/main/java/org/apache/sdap/mudrod/ssearch/package-info.java +++ /dev/null @@ -1,18 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -/** - * This package includes classes for semantic search, such as click stream importer, - * query dispatcher, semantic searcher, and ranker (ranksvm, ordinal/linear regression) - */ -package org.apache.sdap.mudrod.ssearch; diff --git a/core/src/main/java/org/apache/sdap/mudrod/ssearch/ranking/Evaluator.java b/core/src/main/java/org/apache/sdap/mudrod/ssearch/ranking/Evaluator.java deleted file mode 100644 index 0efb82f..0000000 --- a/core/src/main/java/org/apache/sdap/mudrod/ssearch/ranking/Evaluator.java +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.sdap.mudrod.ssearch.ranking; - -import java.util.Collections; -import java.util.Comparator; -import java.util.List; -import java.util.stream.Collectors; -import java.util.stream.IntStream; - -/** - * Supports ability to evaluating ranking results - */ -public class Evaluator { - /** - * Method of calculating NDCG score - * - * @param list a list of integer with each integer element indicating - * the performance at its position - * @param k the number of elements needed to be included in the calculation - * @return NDCG score - */ - public double getNDCG(int[] list, int k) { - double dcg = this.getDCG(list, k); - double idcg = this.getIDCG(list, k); - double ndcg = 0.0; - if (idcg > 0.0) { - ndcg = dcg / idcg; - } - return ndcg; - } - - /** - * Method of getting the precision of a list at position K - * - * @param list a list of integer with each integer element indicating - * the performance at its position - * @param k the number of elements needed to be included in the calculation - * @return precision at K - */ - public double getPrecision(int[] list, int k) { - int size = list.length; - if (size == 0 || k == 0) { - return 0; - } - - if (k > size) { - k = size; - } - - int relDocNum = this.getRelevantDocNum(list, k); - return (double) relDocNum / (double) k; - } - - /** - * Method of getting the number of relevant element in a ranking results - * - * @param list a list of integer with each integer element indicating - * the performance at its position - * @param k the number of elements needed to be included in the calculation - * @return the number of relevant element - */ - private int getRelevantDocNum(int[] list, int k) { - int size = list.length; - if (size == 0 || k == 0) { - return 0; - } - - if (k > size) { - k = size; - } - - int relNum = 0; - for (int i = 0; i < k; i++) { - if (list[i] > 3) { // 3 refers to "OK" - relNum++; - } - } - return relNum; - } - - /** - * Method of calculating DCG score from a list of ranking results 
- * - * @param list a list of integer with each integer element indicating - * the performance at its position - * @param k the number of elements needed to be included in the calculation - * @return DCG score - */ - private double getDCG(int[] list, int k) { - int size = list.length; - if (size == 0 || k == 0) { - return 0.0; - } - - if (k > size) { - k = size; - } - - double dcg = list[0]; - for (int i = 1; i < k; i++) { - int rel = list[i]; - int pos = i + 1; - double relLog = Math.log(pos) / Math.log(2); - dcg += rel / relLog; - } - return dcg; - } - - /** - * Method of calculating ideal DCG score from a list of ranking results - * - * @param list a list of integer with each integer element indicating - * the performance at its position - * @param k the number of elements needed to be included in the calculation - * @return IDCG score - */ - private double getIDCG(int[] list, int k) { - Comparator comparator = new Comparator() { - @Override - public int compare(Integer o1, Integer o2) { - return o2.compareTo(o1); - } - }; - List sortlist = IntStream.of(list).boxed().collect(Collectors.toList()); - Collections.sort(sortlist, comparator); - int[] sortedArr = sortlist.stream().mapToInt(i -> i).toArray(); - return this.getDCG(sortedArr, k); - } - -} diff --git a/core/src/main/java/org/apache/sdap/mudrod/ssearch/structure/SResult.java b/core/src/main/java/org/apache/sdap/mudrod/ssearch/structure/SResult.java deleted file mode 100644 index 81de2b4..0000000 --- a/core/src/main/java/org/apache/sdap/mudrod/ssearch/structure/SResult.java +++ /dev/null @@ -1,182 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.sdap.mudrod.ssearch.structure; - -import java.lang.reflect.Field; - -/** - * Data structure class for search result - */ -public class SResult { - public static final String rlist[] = { "term_score", "releaseDate_score", /*"versionNum_score",*/ - "processingL_score", "allPop_score", "monthPop_score", "userPop_score"/*, "termAndv_score"*/ }; - String shortName = null; - String longName = null; - String topic = null; - String description = null; - String relase_date = null; - - public Double final_score = 0.0; - public Double term_score = 0.0; - public Double releaseDate_score = 0.0; - public Double versionNum_score = 0.0; - public Double processingL_score = 0.0; - public Double click_score = 0.0; - public Double allPop_score = 0.0; - public Double monthPop_score = 0.0; - public Double userPop_score = 0.0; - public Double termAndv_score = 0.0; - public Integer below = 0; - - public Double Dataset_LongName_score = null; - public Double Dataset_Metadata_score = null; - public Double DatasetParameter_Term_score = null; - public Double DatasetSource_Source_LongName_score = null; - public Double DatasetSource_Sensor_LongName_score = null; - - public String version = null; - public String processingLevel = null; - public String latency = null; - public String stopDateLong = null; - public String stopDateFormat = null; - public Double spatialR_Sat = null; - public Double spatialR_Grid = null; - public String temporalR = null; - - public Double releaseDate = null; - public Double click = null; - public Double term = null; - public 
Double versionNum = null; - public Double processingL = null; - public Double allPop = null; - public Double monthPop = null; - public Double userPop = null; - public Double termAndv = null; - - public Double Dataset_LongName = null; - public Double Dataset_Metadata = null; - public Double DatasetParameter_Term = null; - public Double DatasetSource_Source_LongName = null; - public Double DatasetSource_Sensor_LongName = null; - - public Double prediction = 0.0; - public String label = null; - - public String startDate; - public String endDate; - public String sensors; - - /** - * @param shortName short name of dataset - * @param longName long name of dataset - * @param topic topic of dataset - * @param description description of dataset - * @param date release date of dataset - */ - public SResult(String shortName, String longName, String topic, String description, String date) { - this.shortName = shortName; - this.longName = longName; - this.topic = topic; - this.description = description; - this.relase_date = date; - } - - public SResult(SResult sr) { - for (String aRlist : rlist) { - set(this, aRlist, get(sr, aRlist)); - } - } - - /** - * Method of getting export header - * - * @param delimiter the delimiter used to separate strings - * @return header - */ - public static String getHeader(String delimiter) { - String str = ""; - for (String aRlist : rlist) { - str += aRlist + delimiter; - } - str = str + "label" + "\n"; - return "ShortName" + delimiter + "below" + delimiter + str; - } - - /** - * Method of get a search results as string - * - * @param delimiter the delimiter used to separate strings - * @return search result as string - */ - public String toString(String delimiter) { - String str = ""; - for (String aRlist : rlist) { - double score = get(this, aRlist); - str += score + delimiter; - } - str = str + label + "\n"; - return shortName + delimiter + below + delimiter + str; - } - - /** - * Generic setter method - * - * @param object instance of 
SResult - * @param fieldName field name that needs to be set on - * @param fieldValue field value that needs to be set to - * @return 1 means success, and 0 otherwise - */ - public static boolean set(Object object, String fieldName, Object fieldValue) { - Class clazz = object.getClass(); - while (clazz != null) { - try { - Field field = clazz.getDeclaredField(fieldName); - field.setAccessible(true); - field.set(object, fieldValue); - return true; - } catch (NoSuchFieldException e) { - clazz = clazz.getSuperclass(); - } catch (Exception e) { - throw new IllegalStateException(e); - } - } - return false; - } - - /** - * Generic getter method - * - * @param object instance of SResult - * @param fieldName field name of search result - * @param data type - * @return the value of the filed in the object - */ - @SuppressWarnings("unchecked") - public static V get(Object object, String fieldName) { - Class clazz = object.getClass(); - while (clazz != null) { - try { - Field field = clazz.getDeclaredField(fieldName); - field.setAccessible(true); - return (V) field.get(object); - } catch (NoSuchFieldException e) { - clazz = clazz.getSuperclass(); - } catch (Exception e) { - throw new IllegalStateException(e); - } - } - return null; - } - -} diff --git a/core/src/main/java/org/apache/sdap/mudrod/ssearch/structure/package-info.java b/core/src/main/java/org/apache/sdap/mudrod/ssearch/structure/package-info.java deleted file mode 100644 index 5e75a40..0000000 --- a/core/src/main/java/org/apache/sdap/mudrod/ssearch/structure/package-info.java +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * This package includes data structure needed for ranking process - */ -package org.apache.sdap.mudrod.ssearch.structure; diff --git a/core/src/main/java/org/apache/sdap/mudrod/weblog/process/ClickStreamAnalyzer.java b/core/src/main/java/org/apache/sdap/mudrod/weblog/process/ClickStreamAnalyzer.java index 70c4067..fcfe688 100644 --- a/core/src/main/java/org/apache/sdap/mudrod/weblog/process/ClickStreamAnalyzer.java +++ b/core/src/main/java/org/apache/sdap/mudrod/weblog/process/ClickStreamAnalyzer.java @@ -18,7 +18,6 @@ import org.apache.sdap.mudrod.driver.SparkDriver; import org.apache.sdap.mudrod.main.MudrodConstants; import org.apache.sdap.mudrod.semantics.SVDAnalyzer; -import org.apache.sdap.mudrod.ssearch.ClickstreamImporter; import org.apache.sdap.mudrod.utils.LinkageTriple; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -62,8 +61,8 @@ public Object execute() { svd.saveToES(tripleList, props.getProperty(MudrodConstants.ES_INDEX_NAME), MudrodConstants.CLICK_STREAM_LINKAGE_TYPE); // Store click stream in ES for the ranking use - ClickstreamImporter cs = new ClickstreamImporter(props, es, spark); - cs.importfromCSVtoES(); + //ClickstreamImporter cs = new ClickstreamImporter(props, es, spark); + //cs.importfromCSVtoES(); } } catch (Exception e) { LOG.error("Encountered an error during execution of ClickStreamAnalyzer.", e); diff --git a/core/src/main/resources/javaSVMWithSGDModel/data/_SUCCESS b/core/src/main/resources/javaSVMWithSGDModel/data/_SUCCESS deleted file mode 100644 index e69de29..0000000 diff --git 
a/core/src/main/resources/javaSVMWithSGDModel/data/_common_metadata b/core/src/main/resources/javaSVMWithSGDModel/data/_common_metadata deleted file mode 100644 index cafbf1b4cd44b0290ced70a047b59064c2190f9d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1022 zcmb7@T}vB56o!Xws#Uz*vVl@$Ku}1R+U&-VUKPFYN~GA{h?J9=lVs@ZhcmO4ZOA|D zo&Ti2pflNZ8c~B@Ewgjp^FHTe&cA#;41?~U(;Z!c`uRh%8D*H2a)(zp4p%(pO3FuANfG+XwDCxL1XwC9UN*GI9$ONRu9=)rcj^>L+ z&sXVY=1Zim%TrYM^tMA!%iBr&44xO1g4+IWr`=SulAjg10g>x7%h@t6R;kyzHf!@- zQ(jcA5q4xR_q?*bTIuClD!-y0jC$+;<5-)*Z}m%4TrEiSxn1@TncCeSS3>gL{wJ0U n21$RIqzFSe`k1nQ$`fS62^*vdv){?^aKI19-&cg3+n4_jb&^Qd diff --git a/core/src/main/resources/javaSVMWithSGDModel/data/_metadata b/core/src/main/resources/javaSVMWithSGDModel/data/_metadata deleted file mode 100644 index 6bfec98c70dc1104bbcb3a5b730a683091c8921a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1757 zcmb`I%WB&|6o$t_vE7iZgaoF9ftUm$;ftgwsiBl+kyQyL&8CEQERAiEvFvC@4NfqH z?AmAOv!q)uN}+VwRh@U}s*BFZvf{L{X-EXV$U5hI|GAC#pS`RqlKy3Hh4xW&{DyiR z+(k+r*pgN_#X-Bns8R%qv<5Vp4f=TPvAB^mnFeP#u?j*z@Gw;*;U#M_2`E#fiAw5BC|E+_O%EX{Segph|j4Ct8DxWkzIF#AV=0 z#YMPMH=L$ZHLUG=)o7yHwo%)vS1cdf$aXEWeP(i}lNd9#SmG($`6AHQr#U9MDY7c5 zhw53m*wEx006b;!uJRYSC-Ne93saOZVY=$u@XA|4Z zC-#MdM-b)q@E>fKkf3RjAVj2X9%7I6qF1{I+@n8up8E6$6NPh4x%7iXUr6EcXH_qxPEN)Dh5Ef%6oR+x?#v@ztC?CZRF$2;nJ}24 zuiLROL^k|)4RJIC+LQFGHoDR3dNU4)zgn+(mg87;*RnBk(N@Lw>b7rT&uw{*ZF&5& R core + ranking service web diff --git a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/Learner.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/Learner.java new file mode 100644 index 0000000..315b8bd --- /dev/null +++ b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/Learner.java @@ -0,0 +1,84 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.sdap.mudrod.ranking.common; + +import org.apache.sdap.mudrod.discoveryengine.MudrodAbstract; +import org.apache.sdap.mudrod.driver.ESDriver; +import org.apache.sdap.mudrod.driver.SparkDriver; +import org.apache.sdap.mudrod.ranking.traindata.ExpertData; +import org.apache.sdap.mudrod.ranking.traindata.TrainingImporter; +import org.apache.spark.SparkContext; +import org.apache.spark.mllib.classification.SVMModel; +import org.apache.spark.mllib.regression.LabeledPoint; + +import java.io.File; +import java.io.IOException; +import java.io.Serializable; +import java.util.Properties; + +/** + * Supports the ability to importing classifier into memory + */ +public abstract class Learner extends MudrodAbstract { + + public Learner(Properties props, ESDriver es, SparkDriver spark) { + super(props, es, spark); + // TODO Auto-generated constructor stub + } + + /** + * Constructor to load in spark SVM classifier + * + * @param classifierName + * classifier type + * @param skd + * an instance of spark driver + * @param svmSgdModel + * path to a trained model + */ + + public String extractTrainDataFromExperts(String sourceDir){ + File sourceFile = new File(sourceDir); + boolean bDir = sourceFile.isDirectory(); + boolean multFiles = false; + if(bDir){ + multFiles = true; + } + + String resultDir = sourceFile.getParent() + "/trainsets.txt"; + ExpertData converter = new ExpertData(sourceDir, resultDir, true); + converter.convert2TrainSet(); + + return resultDir; + } + + /** + * Method of classifying instance + * + * @param p + * the instance 
that needs to be classified + * @return the class id + */ + //public abstract double classify(LabeledPoint p); + + public abstract String prepareTrainData(String sourceDir); + + public abstract void train(String trainFile); + + public abstract double predict(double[] value); + + public abstract void save(); + + public abstract void load(String model); +} diff --git a/core/src/main/java/org/apache/sdap/mudrod/ssearch/ranking/Learner.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/LearnerFactory.java similarity index 57% rename from core/src/main/java/org/apache/sdap/mudrod/ssearch/ranking/Learner.java rename to ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/LearnerFactory.java index 8a752a3..1aa0fdf 100644 --- a/core/src/main/java/org/apache/sdap/mudrod/ssearch/ranking/Learner.java +++ b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/LearnerFactory.java @@ -11,47 +11,44 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.sdap.mudrod.ssearch.ranking; +package org.apache.sdap.mudrod.ranking.common; +import org.apache.sdap.mudrod.discoveryengine.MudrodAbstract; +import org.apache.sdap.mudrod.driver.ESDriver; import org.apache.sdap.mudrod.driver.SparkDriver; +import org.apache.sdap.mudrod.main.MudrodConstants; +import org.apache.sdap.mudrod.ranking.ranksvm.SVMLearner; import org.apache.spark.SparkContext; import org.apache.spark.mllib.classification.SVMModel; import org.apache.spark.mllib.regression.LabeledPoint; import java.io.Serializable; +import java.util.Properties; /** * Supports the ability to importing classifier into memory */ -public class Learner implements Serializable { - /** - * - */ - private static final long serialVersionUID = 1L; - SVMModel model = null; - transient SparkContext sc = null; +public class LearnerFactory extends MudrodAbstract { - /** - * Constructor to load in spark SVM classifier - * - * @param classifierName classifier type - * @param skd an instance of spark driver - * @param svmSgdModel path to a trained model - */ - public Learner(SparkDriver skd, String svmSgdModel) { - sc = skd.sc.sc(); - sc.addFile(svmSgdModel, true); - model = SVMModel.load(sc, svmSgdModel); - } + public LearnerFactory(Properties props, ESDriver es, SparkDriver spark) { + super(props, es, spark); + // TODO Auto-generated constructor stub + } - /** +/** * Method of classifying instance * * @param p the instance that needs to be classified * @return the class id */ - public double classify(LabeledPoint p) { - return model.predict(p.features()); + + public Learner createLearner(){ + if("1".equals(props.getProperty(MudrodConstants.RANKING_ML))) + return new SVMLearner(props, es, spark, props.getProperty(MudrodConstants.RANKING_MODEL)); + + return null; + } + } diff --git a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/Ranker.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/Ranker.java index be2f5c9..cb3c9a8 100644 --- 
a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/Ranker.java +++ b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/Ranker.java @@ -17,8 +17,7 @@ import org.apache.sdap.mudrod.driver.ESDriver; import org.apache.sdap.mudrod.driver.SparkDriver; import org.apache.sdap.mudrod.main.MudrodConstants; -import org.apache.sdap.mudrod.ssearch.ranking.Learner; -import org.apache.sdap.mudrod.ssearch.structure.SResult; +import org.apache.sdap.mudrod.ranking.structure.SResult; import org.apache.spark.mllib.linalg.Vectors; import org.apache.spark.mllib.regression.LabeledPoint; @@ -36,8 +35,9 @@ public class Ranker extends MudrodAbstract implements Serializable { public Ranker(Properties props, ESDriver es, SparkDriver spark) { super(props, es, spark); - if("1".equals(props.getProperty(MudrodConstants.RANKING_ML))) - le = new Learner(spark, props.getProperty(MudrodConstants.RANKING_MODEL)); + + LearnerFactory factory = new LearnerFactory(props, es, spark); + le = factory.createLearner(); } /** @@ -160,11 +160,9 @@ public int compare(SResult o1, SResult o2) { } double[] ins = instList.stream().mapToDouble(i -> i).toArray(); - LabeledPoint insPoint = new LabeledPoint(99.0, Vectors.dense(ins)); - int prediction = (int)le.classify(insPoint); + int prediction = (int)le.predict(ins); return prediction; } } - -} +} \ No newline at end of file diff --git a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/Searcher.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/Searcher.java index e3f7660..6419516 100644 --- a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/Searcher.java +++ b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/Searcher.java @@ -20,7 +20,7 @@ import org.apache.sdap.mudrod.discoveryengine.MudrodAbstract; import org.apache.sdap.mudrod.driver.ESDriver; import org.apache.sdap.mudrod.driver.SparkDriver; -import org.apache.sdap.mudrod.ssearch.structure.SResult; +import 
org.apache.sdap.mudrod.ranking.structure.SResult; import org.elasticsearch.action.search.SearchRequestBuilder; import org.elasticsearch.action.search.SearchResponse; import org.elasticsearch.index.query.BoolQueryBuilder; diff --git a/core/src/main/java/org/apache/sdap/mudrod/ssearch/ranking/SparkFormatter.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/dlrank/SparkFormatter.java similarity index 97% rename from core/src/main/java/org/apache/sdap/mudrod/ssearch/ranking/SparkFormatter.java rename to ranking/src/main/java/org/apache/sdap/mudrod/ranking/dlrank/SparkFormatter.java index ad61607..356365b 100644 --- a/core/src/main/java/org/apache/sdap/mudrod/ssearch/ranking/SparkFormatter.java +++ b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/dlrank/SparkFormatter.java @@ -1,4 +1,4 @@ -package org.apache.sdap.mudrod.ssearch.ranking; +package org.apache.sdap.mudrod.ranking.dlrank; import java.io.*; import java.text.DecimalFormat; diff --git a/core/src/main/java/org/apache/sdap/mudrod/ssearch/ranking/SparkSVM.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/dlrank/SparkSVM.java similarity index 97% rename from core/src/main/java/org/apache/sdap/mudrod/ssearch/ranking/SparkSVM.java rename to ranking/src/main/java/org/apache/sdap/mudrod/ranking/dlrank/SparkSVM.java index 0d0eb8d..388c632 100644 --- a/core/src/main/java/org/apache/sdap/mudrod/ssearch/ranking/SparkSVM.java +++ b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/dlrank/SparkSVM.java @@ -11,7 +11,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.sdap.mudrod.ssearch.ranking; +package org.apache.sdap.mudrod.ranking.dlrank; import org.apache.sdap.mudrod.main.MudrodEngine; import org.apache.spark.api.java.JavaRDD; diff --git a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/offline/predict/OfflineLearner.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/offline/predict/OfflineLearner.java deleted file mode 100644 index f7711b8..0000000 --- a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/offline/predict/OfflineLearner.java +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.sdap.mudrod.ranking.offline.predict; - -import org.apache.sdap.mudrod.driver.SparkDriver; -import org.apache.spark.SparkContext; -import org.apache.spark.mllib.classification.SVMModel; -import org.apache.spark.mllib.regression.LabeledPoint; - -import java.io.Serializable; - -/** - * Supports the ability to importing classifier into memory - */ -public class OfflineLearner implements Serializable { - /** - * - */ - private static final long serialVersionUID = 1L; - SVMModel model = null; - transient SparkContext sc = null; - - /** - * Constructor to load in spark SVM classifier - * - * @param classifierName classifier type - * @param skd an instance of spark driver - * @param svmSgdModel path to a trained model - */ - public OfflineLearner(SparkDriver skd, String svmSgdModel) { - sc = skd.sc.sc(); - sc.addFile(svmSgdModel, true); - model = SVMModel.load(sc, svmSgdModel); - } - - /** - * Method of classifying instance - * - * @param p the instance that needs to be classified - * @return the class id - */ - public double classify(LabeledPoint p) { - return model.predict(p.features()); - } - -} diff --git a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/offline/train/ClickstreamImporter.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/offline/train/ClickstreamImporter.java deleted file mode 100644 index 35bba2e..0000000 --- a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/offline/train/ClickstreamImporter.java +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.sdap.mudrod.ranking.offline.train; - -import org.apache.sdap.mudrod.discoveryengine.MudrodAbstract; -import org.apache.sdap.mudrod.driver.ESDriver; -import org.apache.sdap.mudrod.driver.SparkDriver; -import org.apache.sdap.mudrod.main.MudrodConstants; -import org.elasticsearch.action.index.IndexRequest; -import org.elasticsearch.common.xcontent.XContentBuilder; - -import java.io.BufferedReader; -import java.io.FileNotFoundException; -import java.io.FileReader; -import java.io.IOException; -import java.util.Properties; - -import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; - -/** - * Supports ability to import click stream data into Elasticsearch - * through .csv file - */ -public class ClickstreamImporter extends MudrodAbstract { - /** - * - */ - private static final long serialVersionUID = 1L; - - public ClickstreamImporter(Properties props, ESDriver es, SparkDriver spark) { - super(props, es, spark); - addClickStreamMapping(); - } - - /** - * Method to add Elasticsearch mapping for click stream data - */ - public void addClickStreamMapping() { - XContentBuilder Mapping; - try { - Mapping = jsonBuilder().startObject().startObject( - props.getProperty(MudrodConstants.CLICK_STREAM_MATRIX_TYPE)).startObject( - "properties").startObject("query").field("type", "string").field( - "index", "not_analyzed").endObject().startObject("dataID").field( - "type", "string").field("index", "not_analyzed").endObject() - - .endObject().endObject().endObject(); - - es.getClient().admin().indices().preparePutMapping( - props.getProperty(MudrodConstants.ES_INDEX_NAME)).setType( - props.getProperty(MudrodConstants.CLICK_STREAM_MATRIX_TYPE)).setSource( - Mapping).execute().actionGet(); - } catch (IOException e) { - e.printStackTrace(); - } - } - - /** - * Method to import click stream CSV into Elasticsearch - */ - public void 
importfromCSVtoES() { - es.deleteType(props.getProperty(MudrodConstants.ES_INDEX_NAME), - props.getProperty(MudrodConstants.CLICK_STREAM_MATRIX_TYPE)); - es.createBulkProcessor(); - - BufferedReader br = null; - String cvsSplitBy = ","; - - try { - br = new BufferedReader(new FileReader(props.getProperty(MudrodConstants.CLICKSTREAM_PATH))); - String line = br.readLine(); - // first item needs to be skipped - String[] dataList = line.split(cvsSplitBy); - while ((line = br.readLine()) != null) { - String[] clicks = line.split(cvsSplitBy); - for (int i = 1; i < clicks.length; i++) { - if (!"0.0".equals(clicks[i])) { - IndexRequest ir = new IndexRequest(props.getProperty(MudrodConstants.ES_INDEX_NAME), - props.getProperty(MudrodConstants.CLICK_STREAM_MATRIX_TYPE)) - .source(jsonBuilder().startObject().field("query", clicks[0]).field( - "dataID", dataList[i]).field("clicks", clicks[i]).endObject()); - es.getBulkProcessor().add(ir); - } - } - } - } catch (FileNotFoundException e) { - e.printStackTrace(); - } catch (IOException e) { - e.printStackTrace(); - } finally { - if (br != null) { - try { - br.close(); - es.destroyBulkProcessor(); - } catch (IOException e) { - e.printStackTrace(); - } - } - } - } - -} diff --git a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/offline/train/OfflineDataGenerator.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/offline/train/OfflineDataGenerator.java deleted file mode 100644 index 9a55523..0000000 --- a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/offline/train/OfflineDataGenerator.java +++ /dev/null @@ -1,310 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.sdap.mudrod.ranking.offline.train; - -import au.com.bytecode.opencsv.CSVReader; -import au.com.bytecode.opencsv.CSVWriter; - -import java.io.*; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -/** - * SVMData is a program designed to create appropriate input data for the RankSVM - * algorithm that involves Pairwise Classification. Specifically, instead of working in - * the space of query-document vectors, e.g. x1, x2, x3, we transform them into a new space - * in which a pair of documents is represented as the difference between their feature vectors. - */ -public class OfflineDataGenerator { - private static String mySourceDir; - private static String myResultDir; - private static boolean isMultFiles; - - private static String[] myHeader; - private static List> myMasterList = new ArrayList<>(); - - // HashMap used for comparing evaluation classes - public static final Map map1 = new HashMap<>(); - - static { - map1.put("Excellent", 7); - map1.put("Very good", 6); - map1.put("Good", 5); - map1.put("OK", 4); - map1.put("Bad", 3); - map1.put("Very bad", 2); - map1.put("Terrible", 1); - } - - /** - * Constructor which takes in path containing one or multiple files to process. - * Also takes in argument specifying whether or not a single file needs to be processed, - * or multiple files need to be processed. 
- * - * @param sourceDir directory containing single file or multiple files to be processed - * @param resultDir output folder - * @param multFiles true if multiple files in directory need to be processed and false if - * only a single file needs to be processed - */ - public OfflineDataGenerator(String sourceDir, String resultDir, boolean multFiles) { - mySourceDir = sourceDir; - myResultDir = resultDir; - isMultFiles = multFiles; - } - - /** - * Responsible for invoking the processing of data file(s) and their subsequent storage - * into a user specified directory. - */ - public void process() { - parseFile(); - writeCSVfile(myMasterList); - } - - /** - * Parses the original user-specified CSV file, storing the contents for future calculations - * and formatting. - */ - public static void parseFile() { - String[][] dataArr = null; - try { - String sourceDir = mySourceDir; - - if (isMultFiles) // Case where multiple files have to be processed - { - // Iterate over files in directory - File directory = new File(sourceDir); - File[] directoryListing = directory.listFiles(); - - if (directoryListing != null) { - for (File child : directoryListing) { - CSVReader csvReader = new CSVReader(new FileReader(child)); - List list = csvReader.readAll(); - - // Store into 2D array by transforming array list to normal array - dataArr = new String[list.size()][]; - dataArr = list.toArray(dataArr); - - calculateVec(dataArr); - - csvReader.close(); - } - storeHead(dataArr); // Store the header - } - } else // Process only one file - { - File file = new File(sourceDir); - - if (file != null) { - CSVReader csvReader = new CSVReader(new FileReader(file)); - List list = csvReader.readAll(); - - // Store into 2D array by transforming array list to normal array - dataArr = new String[list.size()][]; - dataArr = list.toArray(dataArr); - - storeHead(dataArr); // Store the header - calculateVec(dataArr); - - csvReader.close(); - } - } - } catch (FileNotFoundException e) { - 
e.printStackTrace(); - } catch (IOException e) { - e.printStackTrace(); - } - } - - /** - * Performs the necessary vector calculations on each possible combination of vectors, - * also storing a value that indicates the evaluation. - * - * @param arr the parsed contents of the original CSV file - */ - public static void calculateVec(String[][] arr) { - List> listofLists = new ArrayList<>(); // Holds calculations - - int rowStart = 1; - for (int row = rowStart; row < arr.length; row++) // Start at row 1 because row 0 is heading lol - { - for (int i = 1; i < arr.length - row; i++) { - List colList = new ArrayList(); // create vector to store all values inside of a column, which is stored inside 2D vector - for (int col = 0; col < arr[0].length - 1; col++) // Columns go until the next to last column - { - // Extract double value from each cell - double x1 = Double.parseDouble(arr[row][col]); - double x2 = Double.parseDouble(arr[row + i][col]); - - // Perform calculation for each cell - double result = x1 - x2; - - // Convert this double value into string, and store inside array list - String strResult = Double.toString(result); - colList.add(strResult); - } - - // Finally, add either 1, -1, or do not add row at all when encountering evaluation value - int addEvalNum = compareEvaluation(arr[row][arr[0].length - 1], arr[row + i][arr[0].length - 1]); - if (addEvalNum == 1) { - colList.add("1"); - listofLists.add(colList); // Add this list to 2D list - row is finished now, move on - } else if (addEvalNum == -1) { - colList.add("-1"); - listofLists.add(colList); // Add this list to 2D list - row is finished now, move on - } - // Else, they are equal, do not even add this row to 2D vector - } - } - - // After all processing takes place, send to method that recreates data with equal # of 1's and -1's - List> equalizedList = equalizeList(listofLists); - myMasterList.addAll(equalizedList); - } - - /** - * Taking in two vector evaluation parameters, compares these two 
evaluations, returning a 1 - * if the first evaluation is greater than the second, a -1 in the case the first evaluation is - * less than the second, and a 10 in the case that the two are equal, meaning this vector will - * not be used. - * - * @param eval1 evaluation from first vector - * @param eval2 evaluation from second vector - * @return 1 if first evaluation is greater than the second, -1 if first evaluation is less than the second, and - * 10 in the case that the two are equal - */ - public static int compareEvaluation(String eval1, String eval2) { - int evalNum1 = map1.get(eval1); - int evalNum2 = map1.get(eval2); - - if (evalNum1 > evalNum2) // ">" means it is more relevant - assign a 1 - { - return 1; - } else if (evalNum1 < evalNum2) { - return -1; - } else { - return 10; // Return 10 if they are equal - signifies you cannot use the row - } - } - - /** - * After vector calculations and new evaluation values are set, produces refined output data such that - * there is an equal or close to equal number of rows containing both "1" and "-1" as the new evaluation value. 
- * - * @param rawList originally calculated data from the input CSV file - * @return data that has an equal distribution of evaluation values - */ - public static List> equalizeList(List> rawList) { - // Create two sets - one containing row index for +1 and the other for -1 - List pos1List = new ArrayList<>(); - List neg1List = new ArrayList<>(); - - for (int i = 0; i < rawList.size(); i++) // Iterate through all rows to get indexes - { - int evalNum = Integer.parseInt(rawList.get(i).get(rawList.get(0).size() - 1)); // Get 1 or -1 from original array list - if (evalNum == 1) { - pos1List.add(i); // Add row index that has 1 - } else if (evalNum == -1) { - neg1List.add(i); // Add row index that has -1 - } - } - - int totPosCount = pos1List.size(); // Total # of 1's - int totNegCount = neg1List.size(); // Total # of -1's - - if ((totPosCount - totNegCount) >= 1) // There are more 1's than -1's, equalize them - { - int indexOfPosList = 0; // Start getting indexes from the first index of positive index location list - while ((totPosCount - totNegCount) >= 1) // Keep going until we have acceptable amount of both +1 and -1 - { - int pos1IndexVal = pos1List.get(indexOfPosList); // Get index from previously made list of indexes - for (int col = 0; col < rawList.get(0).size(); col++) // Go through elements of indexed row, negating it to transform to -1 row - { - double d = Double.parseDouble(rawList.get(pos1IndexVal).get(col)); // Transform to double first - d = d * -1; // Negate it - String negatedValue = Double.toString(d); // Change back to String - rawList.get(pos1IndexVal).set(col, negatedValue);// Put this value back into dat row - } - - totPosCount--; // We changed a +1 row to a -1 row, decrement count of positives - totNegCount++; // Increment count of negatives - indexOfPosList++; // Get next +1 location in raw data - } - } else if ((totNegCount - totPosCount) > 1) // There are more -1's than 1's, equalize them - { - int indexOfNegList = 0; - while ((totNegCount - 
totPosCount) > 1) // Keep going until we have acceptable amount of both +1 and -1 - { - int neg1IndexVal = neg1List.get(indexOfNegList); // Get index from previously made list of indexes - for (int col = 0; col < rawList.get(0).size(); col++) // Go through elements of indexed row, negating it to transform to +1 row - { - double d = Double.parseDouble(rawList.get(neg1IndexVal).get(col)); // Transform to double first - d = d * -1; // Negate it - String negatedValue = Double.toString(d); // Change back to String - rawList.get(neg1IndexVal).set(col, negatedValue);// Put this value back into dat row - } - - totNegCount--; // We changed a -1 row to a +1 row, decrement count of negatives now - totPosCount++; // Increment count of positives - indexOfNegList++; // Get next -1 location in raw data - } - } else { - // Do nothing - rows are within acceptable equality bounds of plus or minus 1 - } - - return rawList; - } - - /** - * Retrieves the heading from a file to be processed so it can be written to the output file later. - * - * @param arr 2D array containing the parsed information from input file - */ - public static void storeHead(String[][] arr) { - myHeader = new String[arr[0].length]; // Reside private variable - - System.arraycopy(arr[0], 0, myHeader, 0, arr[0].length); - } - - /** - * Writes newly calculated and equally distributed vector data to user specified CSV file. 
- * - * @param list finalized vector data to write to user specified output file - */ - public static void writeCSVfile(List> list) { - String outputFile = myResultDir; - boolean alreadyExists = new File(outputFile).exists(); - - try { - CSVWriter csvOutput = new CSVWriter(new FileWriter(outputFile), ','); // Create new instance of CSVWriter to write to file output - - if (!alreadyExists) { - csvOutput.writeNext(myHeader); // Write the text headers first before data - - for (List aList : list) { // Iterate through all rows in 2D array - String[] temp = new String[aList.size()]; // Convert row array list in 2D array to regular string array - temp = aList.toArray(temp); - csvOutput.writeNext(temp); // Write this array to the file - } - } - - csvOutput.close(); // Close csvWriter - } catch (IOException e) { - e.printStackTrace(); - } - } - -} diff --git a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/offline/train/TrainingImporter.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/offline/train/TrainingImporter.java deleted file mode 100644 index 12e1173..0000000 --- a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/offline/train/TrainingImporter.java +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.sdap.mudrod.ranking.offline.train; - -import org.apache.sdap.mudrod.discoveryengine.MudrodAbstract; -import org.apache.sdap.mudrod.driver.ESDriver; -import org.apache.sdap.mudrod.driver.SparkDriver; -import org.apache.sdap.mudrod.main.MudrodConstants; -import org.elasticsearch.action.index.IndexRequest; -import org.elasticsearch.common.xcontent.XContentBuilder; -import org.elasticsearch.index.query.QueryBuilders; - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileReader; -import java.io.IOException; -import java.util.Properties; - -import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; - -/** - * Supports the ability to importing training set into Elasticsearch - */ -public class TrainingImporter extends MudrodAbstract { - /** - * - */ - private static final long serialVersionUID = 1L; - - public TrainingImporter(Properties props, ESDriver es, SparkDriver spark) { - super(props, es, spark); - es.deleteAllByQuery(props.getProperty(MudrodConstants.ES_INDEX_NAME), "trainingranking", QueryBuilders.matchAllQuery()); - addMapping(); - } - - /** - * Method of adding mapping to traning set type - */ - public void addMapping() { - XContentBuilder Mapping; - try { - Mapping = jsonBuilder().startObject().startObject("trainingranking").startObject("properties").startObject("query").field("type", "string").field("index", "not_analyzed").endObject() - .startObject("dataID").field("type", "string").field("index", "not_analyzed").endObject().startObject("label").field("type", "string").field("index", "not_analyzed").endObject().endObject() - .endObject().endObject(); - - es.getClient().admin().indices().preparePutMapping(props.getProperty(MudrodConstants.ES_INDEX_NAME)).setType("trainingranking").setSource(Mapping).execute().actionGet(); - } catch (IOException e) { - e.printStackTrace(); - } - } - - /** - * Method of importing training set in to Elasticsearch - * - * @param dataFolder the path to the traing 
set - * @throws IOException IOException - */ - public void importTrainingSet(String dataFolder) throws IOException { - es.createBulkProcessor(); - - File[] files = new File(dataFolder).listFiles(); - for (File file : files) { - BufferedReader br = new BufferedReader(new FileReader(file.getAbsolutePath())); - br.readLine(); - String line = br.readLine(); - while (line != null) { - String[] list = line.split(","); - String query = file.getName().replace(".csv", ""); - if (list.length > 0) { - IndexRequest ir = new IndexRequest(props.getProperty(MudrodConstants.ES_INDEX_NAME), "trainingranking") - .source(jsonBuilder().startObject().field("query", query).field("dataID", list[0]).field("label", list[list.length - 1]).endObject()); - es.getBulkProcessor().add(ir); - } - line = br.readLine(); - } - br.close(); - } - es.destroyBulkProcessor(); - } -} diff --git a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/offline/train/package-info.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/offline/train/package-info.java deleted file mode 100644 index e067620..0000000 --- a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/offline/train/package-info.java +++ /dev/null @@ -1,18 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -/** - * This package includes classes for importing training data, ML models, - * generating input data for RankSVM, and evaluating ranking results - */ -package org.apache.sdap.mudrod.ranking.offline.train; diff --git a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/ranksvm/SVMLearner.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/ranksvm/SVMLearner.java new file mode 100644 index 0000000..875ef45 --- /dev/null +++ b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/ranksvm/SVMLearner.java @@ -0,0 +1,120 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.sdap.mudrod.ranking.ranksvm; + +import java.io.File; +import java.util.Properties; + +import org.apache.sdap.mudrod.driver.ESDriver; +import org.apache.sdap.mudrod.driver.SparkDriver; +import org.apache.sdap.mudrod.main.MudrodEngine; +import org.apache.sdap.mudrod.ranking.common.Learner; +import org.apache.sdap.mudrod.ranking.common.LearnerFactory; +import org.apache.spark.SparkContext; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.mllib.classification.SVMModel; +import org.apache.spark.mllib.classification.SVMWithSGD; +import org.apache.spark.mllib.linalg.Vectors; +import org.apache.spark.mllib.regression.LabeledPoint; +import org.apache.spark.mllib.util.MLUtils; + +/** + * Supports the ability to importing classifier into memory + */ +public class SVMLearner extends Learner { + /** + * + */ + private static final long serialVersionUID = 1L; + SVMModel model = null; + transient SparkContext sc = null; + + /** + * Constructor to load in spark SVM classifier + * + * @param classifierName + * classifier type + * @param skd + * an instance of spark driver + * @param svmSgdModel + * path to a trained model + */ + public SVMLearner(Properties props, ESDriver es, SparkDriver spark, String svmSgdModel) { + super(props, es, spark); + + sc = spark.sc.sc(); + load(svmSgdModel); + } + + public String prepareTrainData(String sourceDir) { + + //add other source, such as log, streaming + String resultFile = this.extractTrainDataFromExperts(sourceDir); + + //String path = SVMLearner.class.getClassLoader().getResource("trainsets").toString(); + String path = new File(resultFile).getParent(); + String svmSparkFile = path + "/inputDataForSVM_spark.txt"; + SparkFormatter sf = new SparkFormatter(); + sf.toSparkSVMformat(resultFile,svmSparkFile); + + return svmSparkFile; + } + + @Override + public void train(String trainFile) { + //String path = 
SVMLearner.class.getClassLoader().getResource("inputDataForSVM_spark.txt").toString(); + JavaRDD data = MLUtils.loadLibSVMFile(sc, trainFile).toJavaRDD(); + + // Run training algorithm to build the model. + int numIterations = 100; + model = SVMWithSGD.train(data.rdd(), numIterations); + } + + @Override + public double predict(double[] value) { + LabeledPoint p = new LabeledPoint(99.0, Vectors.dense(value)); + return model.predict(p.features()); + } + + @Override + public void save() { + // Save and load model + String modelPath = SVMLearner.class.getClassLoader().getResource("javaSVMWithSGDModel").toString(); + model.save(sc, modelPath); + } + + @Override + public void load(String svmSgdModel) { + // TODO Auto-generated method stub + sc.addFile(svmSgdModel, true); + model = SVMModel.load(sc, svmSgdModel); + } + + public static void main(String[] arg0) { + MudrodEngine me = new MudrodEngine(); + Properties props = me.loadConfig(); + + SparkDriver spark = new SparkDriver(me.getConfig()); + ESDriver es = new ESDriver(me.getConfig()); + + LearnerFactory factory = new LearnerFactory(props, es, spark); + Learner le = factory.createLearner(); + + String sourceDir = "E://data//mudrod//ranking//rankingResults//training//training_data_v4"; + String trainFile = le.prepareTrainData(sourceDir); + le.train(trainFile); + le.save(); + } +} diff --git a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/offline/train/SparkFormatter.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/ranksvm/SparkFormatter.java similarity index 96% rename from ranking/src/main/java/org/apache/sdap/mudrod/ranking/offline/train/SparkFormatter.java rename to ranking/src/main/java/org/apache/sdap/mudrod/ranking/ranksvm/SparkFormatter.java index 3417066..8347bd9 100644 --- a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/offline/train/SparkFormatter.java +++ b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/ranksvm/SparkFormatter.java @@ -1,4 +1,4 @@ -package 
org.apache.sdap.mudrod.ranking.offline.train; +package org.apache.sdap.mudrod.ranking.ranksvm; import java.io.*; import java.text.DecimalFormat; @@ -7,6 +7,7 @@ public class SparkFormatter { DecimalFormat NDForm = new DecimalFormat("#.###"); public SparkFormatter() { + } public void toSparkSVMformat(String inputCSVFileName, String outputTXTFileName) { @@ -51,5 +52,4 @@ public static void main(String[] args) { SparkFormatter sf = new SparkFormatter(); sf.toSparkSVMformat("C:/mudrodCoreTestData/rankingResults/inputDataForSVM.csv", "C:/mudrodCoreTestData/rankingResults/inputDataForSVM_spark.txt"); } - } diff --git a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/offline/train/SparkSVM.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/ranksvm/SparkSVM.java similarity index 96% rename from ranking/src/main/java/org/apache/sdap/mudrod/ranking/offline/train/SparkSVM.java rename to ranking/src/main/java/org/apache/sdap/mudrod/ranking/ranksvm/SparkSVM.java index d8a52ae..abedb76 100644 --- a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/offline/train/SparkSVM.java +++ b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/ranksvm/SparkSVM.java @@ -11,7 +11,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.sdap.mudrod.ranking.offline.train; +package org.apache.sdap.mudrod.ranking.ranksvm; import org.apache.sdap.mudrod.main.MudrodEngine; import org.apache.spark.api.java.JavaRDD; @@ -43,7 +43,5 @@ public static void main(String[] args) { model.save(jsc.sc(), SparkSVM.class.getClassLoader().getResource("javaSVMWithSGDModel").toString()); jsc.sc().stop(); - } - } diff --git a/core/src/main/java/org/apache/sdap/mudrod/ssearch/ClickstreamImporter.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/ClickstreamImporter.java similarity index 98% rename from core/src/main/java/org/apache/sdap/mudrod/ssearch/ClickstreamImporter.java rename to ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/ClickstreamImporter.java index 9121719..5b85b29 100644 --- a/core/src/main/java/org/apache/sdap/mudrod/ssearch/ClickstreamImporter.java +++ b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/ClickstreamImporter.java @@ -11,7 +11,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.sdap.mudrod.ssearch; +package org.apache.sdap.mudrod.ranking.traindata; import org.apache.sdap.mudrod.discoveryengine.MudrodAbstract; import org.apache.sdap.mudrod.driver.ESDriver; diff --git a/core/src/main/java/org/apache/sdap/mudrod/ssearch/ranking/DataGenerator.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/ExpertData.java similarity index 93% rename from core/src/main/java/org/apache/sdap/mudrod/ssearch/ranking/DataGenerator.java rename to ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/ExpertData.java index 4e43ec8..98ba6bb 100644 --- a/core/src/main/java/org/apache/sdap/mudrod/ssearch/ranking/DataGenerator.java +++ b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/ExpertData.java @@ -11,7 +11,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.sdap.mudrod.ssearch.ranking; +package org.apache.sdap.mudrod.ranking.traindata; import au.com.bytecode.opencsv.CSVReader; import au.com.bytecode.opencsv.CSVWriter; @@ -28,7 +28,7 @@ * the space of query-document vectors, e.g. x1, x2, x3, we transform them into a new space * in which a pair of documents is represented as the difference between their feature vectors. */ -public class DataGenerator { +public class ExpertData { private static String mySourceDir; private static String myResultDir; private static boolean isMultFiles; @@ -59,7 +59,7 @@ public class DataGenerator { * @param multFiles true if multiple files in directory need to be processed and false if * only a single file needs to be processed */ - public DataGenerator(String sourceDir, String resultDir, boolean multFiles) { + public ExpertData(String sourceDir, String resultDir, boolean multFiles) { mySourceDir = sourceDir; myResultDir = resultDir; isMultFiles = multFiles; @@ -69,7 +69,7 @@ public DataGenerator(String sourceDir, String resultDir, boolean multFiles) { * Responsible for invoking the processing of data file(s) and their subsequent storage * into a user specified directory. 
*/ - public void process() { + public void convert2TrainSet() { parseFile(); writeCSVfile(myMasterList); } @@ -143,7 +143,7 @@ public static void calculateVec(String[][] arr) { { for (int i = 1; i < arr.length - row; i++) { List colList = new ArrayList(); // create vector to store all values inside of a column, which is stored inside 2D vector - for (int col = 0; col < arr[0].length - 1; col++) // Columns go until the next to last column + for (int col = 1; col < arr[0].length - 2; col++) // Columns go until the next to last column { // Extract double value from each cell double x1 = Double.parseDouble(arr[row][col]); @@ -274,9 +274,9 @@ public static List> equalizeList(List> rawList) { * @param arr 2D array containing the parsed information from input file */ public static void storeHead(String[][] arr) { - myHeader = new String[arr[0].length]; // Reside private variable + myHeader = new String[arr[0].length-1]; // Reside private variable - System.arraycopy(arr[0], 0, myHeader, 0, arr[0].length); + System.arraycopy(arr[0], 1, myHeader, 0, arr[0].length-1); } /** @@ -287,18 +287,18 @@ public static void storeHead(String[][] arr) { public static void writeCSVfile(List> list) { String outputFile = myResultDir; boolean alreadyExists = new File(outputFile).exists(); - + if(alreadyExists){ + new File(outputFile).delete(); + } try { CSVWriter csvOutput = new CSVWriter(new FileWriter(outputFile), ','); // Create new instance of CSVWriter to write to file output - if (!alreadyExists) { - csvOutput.writeNext(myHeader); // Write the text headers first before data + csvOutput.writeNext(myHeader); // Write the text headers first before data - for (List aList : list) { // Iterate through all rows in 2D array - String[] temp = new String[aList.size()]; // Convert row array list in 2D array to regular string array - temp = aList.toArray(temp); - csvOutput.writeNext(temp); // Write this array to the file - } + for (List aList : list) { // Iterate through all rows in 2D array + 
String[] temp = new String[aList.size()]; // Convert row array list in 2D array to regular string array + temp = aList.toArray(temp); + csvOutput.writeNext(temp); // Write this array to the file } csvOutput.close(); // Close csvWriter diff --git a/core/src/main/java/org/apache/sdap/mudrod/ssearch/ranking/TrainingImporter.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/TrainingImporter.java similarity index 98% rename from core/src/main/java/org/apache/sdap/mudrod/ssearch/ranking/TrainingImporter.java rename to ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/TrainingImporter.java index ff55c85..a1f1dff 100644 --- a/core/src/main/java/org/apache/sdap/mudrod/ssearch/ranking/TrainingImporter.java +++ b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/TrainingImporter.java @@ -11,7 +11,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.sdap.mudrod.ssearch.ranking; +package org.apache.sdap.mudrod.ranking.traindata; import org.apache.sdap.mudrod.discoveryengine.MudrodAbstract; import org.apache.sdap.mudrod.driver.ESDriver; diff --git a/core/src/main/java/org/apache/sdap/mudrod/ssearch/ranking/package-info.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/package-info.java similarity index 93% rename from core/src/main/java/org/apache/sdap/mudrod/ssearch/ranking/package-info.java rename to ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/package-info.java index e25207e..41e9834 100644 --- a/core/src/main/java/org/apache/sdap/mudrod/ssearch/ranking/package-info.java +++ b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/package-info.java @@ -15,4 +15,4 @@ * This package includes classes for importing training data, ML models, * generating input data for RankSVM, and evaluating ranking results */ -package org.apache.sdap.mudrod.ssearch.ranking; +package org.apache.sdap.mudrod.ranking.traindata; diff 
--git a/ranking/src/main/resources/config.properties b/ranking/src/main/resources/config.properties deleted file mode 100644 index 4c8991e..0000000 --- a/ranking/src/main/resources/config.properties +++ /dev/null @@ -1,74 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); you -# may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Database configuration -mudrod.cluster.name=MudrodES -mudrod.es.transport.tcp.port = 9300 -mudrod.es.unicast.hosts = 127.0.0.1 -mudrod.es.http.port = 9200 -mudrod.es.index = mudrod - -# Spark related -# Log processing type. Possible values include 'sequential' or 'parallel' -mudrod.processing.type = parallel -mudrod.spark.app.name = MudrodSparkApp -mudrod.spark.master = local[4] -mudrod.spark.optimize = repartition - -# Web log processing configuration -# index name has to be all lowercase -mudrod.log.index = log -mudrod.ftp.prefix = FTP. -mudrod.http.prefix = WWW. -mudrod.base.url = http://podaac.jpl.nasa.gov -mudrod.black.request.list = .js, .css, .jpg, .png, .ico, image_captcha, autocomplete, .gif, /alldata/, /api/, get / http/1.1, .jpeg, /ws/ -mudrod.black.agent.list = crawler, googlebot, bingbot, slurp, yacybot, rogerbot, yandexbot, -, apache-httpclient, java, curl -mudrod.search.freq = 100 -mudrod.view.freq = 200 -mudrod.download.freq = 100 -mudrod.request.rate = 30 -mudrod.session.port = 8080 -mudrod.session.url = /mudrod-service/session.html -mudrod.request.time.gap = 600 -mudrod.view.url.marker = /dataset/ -mudrod.search.url.marker = /datasetlist? 
-# In order to better parse a URL (getting searching keyword, etc.), please consider custimize -# org.apache.sdap.mudrod.weblog.structure.RequestUrl - GetSearchInfo, getFilterInfo - -# User search history -mudrod.query.min = 0 -mudrod.user.history.weight = 2 - -# clickstream -mudrod.download.weight = 3 -mudrod.clickstream.svd.d = 50 -mudrod.clickstream.weight = 2 - -# metadata -mudrod.metadata.download = 0 -mudrod.metadata.download.url = https://podaac.jpl.nasa.gov/api/dataset?startIndex=$startIndex&entries=10&sortField=Dataset-AllTimePopularity&sortOrder=asc&id=&value=&search= -mudrod.metadata.svd.d = 50 -mudrod.metadata.url = null -mudrod.metadata.weight = 1 -mudrod.metadata.type = RawMetadata - -# ranking, ${svmSgdModel.value} is resolved at build time. See the property in core/pom.xml for the value -mudrod.ranking.machine.learning = 1 -mudrod.ranking.model = ${svmSgdModel.value}.zip - -# recommendation -mudrod.metadata.id = Dataset-ShortName -mudrod.metadata.semantic.fields = DatasetParameter-Term,DatasetParameter-Variable,Dataset-ExtractTerm - -# ontology service implementation. 
Possible values include EsipPortal - EsipPortalOntology EsipCOR - EsipCOROntology Local - org.apache.sdap.mudrod.ontology.process.Local -mudrod.ontology.implementation = Local -mudrod.ontology.weight = 2 diff --git a/ranking/src/main/resources/elastic_mappings.json b/ranking/src/main/resources/elastic_mappings.json deleted file mode 100644 index 6a0494f..0000000 --- a/ranking/src/main/resources/elastic_mappings.json +++ /dev/null @@ -1,68 +0,0 @@ -{ - "_default_": { - "properties": { - "keywords": { - "type": "text", - "analyzer": "csv", - "fielddata": true - }, - "views": { - "type": "string", - "analyzer": "csv" - }, - "downloads": { - "type": "string", - "analyzer": "csv" - }, - "RequestUrl": { - "type": "string", - "include_in_all": false, - "index": "no" - }, - "IP": { - "type": "keyword", - "index": "not_analyzed" - }, - "Browser": { - "type": "string", - "include_in_all": false, - "index": "no" - }, - "SessionURL": { - "type": "string", - "include_in_all": false, - "index": "no" - }, - "Referer": { - "type": "string", - "index": "not_analyzed" - }, - "SessionID": { - "type": "string", - "index": "not_analyzed" - }, - "Response": { - "type": "string", - "include_in_all": false, - "index": "no" - }, - "Request": { - "type": "string", - "include_in_all": false, - "index": "no" - }, - "Coordinates": { - "type": "geo_point", - "include_in_all": false, - "index": "no" - }, - "LogType": { - "type": "string", - "index": "not_analyzed" - }, - "Dataset-Metadata": { - "type": "completion" - } - } - } -} \ No newline at end of file diff --git a/ranking/src/main/resources/elastic_settings.json b/ranking/src/main/resources/elastic_settings.json deleted file mode 100644 index 05f8664..0000000 --- a/ranking/src/main/resources/elastic_settings.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "index": { - "number_of_replicas": 0, - "refresh_interval": "-1", - "number_of_shards": "5", - "translog.flush_threshold_size": "1g", - "translog.sync_interval": "30s", - "warmer.enabled": 
"false" - }, - "analysis": { - "filter": { - "cody_stop": { - "type": "stop", - "stopwords": "_english_" - }, - "cody_stemmer": { - "type": "stemmer", - "language": "light_english" - } - }, - "analyzer": { - "cody": { - "tokenizer": "standard", - "filter": [ - "lowercase", - "cody_stop", - "cody_stemmer" - ] - }, - "csv": { - "type": "pattern", - "pattern": "," - } - } - } -} \ No newline at end of file diff --git a/core/src/main/resources/javaSVMWithSGDModel/data/part-r-00000-e008ae03-6b61-4931-ba29-27304de5a584.gz.parquet b/ranking/src/main/resources/javaSVMWithSGDModel/data/part-r-00000-e008ae03-6b61-4931-ba29-27304de5a584.gz.parquet similarity index 100% rename from core/src/main/resources/javaSVMWithSGDModel/data/part-r-00000-e008ae03-6b61-4931-ba29-27304de5a584.gz.parquet rename to ranking/src/main/resources/javaSVMWithSGDModel/data/part-r-00000-e008ae03-6b61-4931-ba29-27304de5a584.gz.parquet diff --git a/ranking/src/main/resources/log4j.properties b/ranking/src/main/resources/log4j.properties deleted file mode 100644 index 5cbc400..0000000 --- a/ranking/src/main/resources/log4j.properties +++ /dev/null @@ -1,63 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 Unless -# -# required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" -# BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either -# express or implied. See the License for the specific language -# governing permissions and limitations under the License. -# Define some default values that can be overridden by system properties -# Logging Threshold -mudrod.root.logger=INFO,DRFA, stdout -mudrod.log.dir=. 
-mudrod.log.file=mudrod.log -log4j.threshhold=ALL -# RootLogger - DailyRollingFileAppender -log4j.rootLogger=${mudrod.root.logger} -#special logging requirements for some commandline tools -log4j.logger.MudrodEngine=INFO,cmdstdout -# -# Daily Rolling File Appender -# -log4j.appender.DRFA=org.apache.log4j.DailyRollingFileAppender -log4j.appender.DRFA.File=${mudrod.log.dir}/${mudrod.log.file} -# Rollver at midnight -log4j.appender.DRFA.DatePattern=.yyyy-MM-dd -# 30-day backup -#log4j.appender.DRFA.MaxBackupIndex=30 -log4j.appender.DRFA.layout=org.apache.log4j.PatternLayout -# Pattern format: Date LogLevel LoggerName LogMessage -log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} - %m%n -# Debugging Pattern format: Date LogLevel LoggerName (FileName:MethodName:LineNo) LogMessage -#log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n -# -# stdout -# Add *stdout* to rootlogger above if you want to use this -# -log4j.appender.stdout=org.apache.log4j.ConsoleAppender -log4j.appender.stdout.layout=org.apache.log4j.PatternLayout -log4j.appender.stdout.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n -# -# plain layout used for commandline tools to output to console -# -log4j.appender.cmdstdout=org.apache.log4j.ConsoleAppender -log4j.appender.cmdstdout.layout=org.apache.log4j.PatternLayout -log4j.appender.cmdstdout.layout.ConversionPattern=%m%n -# -# Rolling File Appender -# -#log4j.appender.RFA=org.apache.log4j.RollingFileAppender -#log4j.appender.RFA.File=${hadoop.log.dir}/${hadoop.log.file} -# Logfile size and and 30-day backups -#log4j.appender.RFA.MaxFileSize=1MB -#log4j.appender.RFA.MaxBackupIndex=30 -#log4j.appender.RFA.layout=org.apache.log4j.PatternLayout -#log4j.appender.RFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} - %m%n -#log4j.appender.RFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n -# Custom Logging levels -log4j.logger.akka=WARN -log4j.logger.org.apache=WARN 
-log4j.logger.org.apache.sdap.mudrod=INFO diff --git a/ranking/src/main/resources/log4j2.properties b/ranking/src/main/resources/log4j2.properties deleted file mode 100644 index 5cbc400..0000000 --- a/ranking/src/main/resources/log4j2.properties +++ /dev/null @@ -1,63 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 Unless -# -# required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" -# BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either -# express or implied. See the License for the specific language -# governing permissions and limitations under the License. -# Define some default values that can be overridden by system properties -# Logging Threshold -mudrod.root.logger=INFO,DRFA, stdout -mudrod.log.dir=. -mudrod.log.file=mudrod.log -log4j.threshhold=ALL -# RootLogger - DailyRollingFileAppender -log4j.rootLogger=${mudrod.root.logger} -#special logging requirements for some commandline tools -log4j.logger.MudrodEngine=INFO,cmdstdout -# -# Daily Rolling File Appender -# -log4j.appender.DRFA=org.apache.log4j.DailyRollingFileAppender -log4j.appender.DRFA.File=${mudrod.log.dir}/${mudrod.log.file} -# Rollver at midnight -log4j.appender.DRFA.DatePattern=.yyyy-MM-dd -# 30-day backup -#log4j.appender.DRFA.MaxBackupIndex=30 -log4j.appender.DRFA.layout=org.apache.log4j.PatternLayout -# Pattern format: Date LogLevel LoggerName LogMessage -log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} - %m%n -# Debugging Pattern format: Date LogLevel LoggerName (FileName:MethodName:LineNo) LogMessage -#log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n -# -# stdout -# Add *stdout* to rootlogger above if you want to use this -# 
-log4j.appender.stdout=org.apache.log4j.ConsoleAppender -log4j.appender.stdout.layout=org.apache.log4j.PatternLayout -log4j.appender.stdout.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n -# -# plain layout used for commandline tools to output to console -# -log4j.appender.cmdstdout=org.apache.log4j.ConsoleAppender -log4j.appender.cmdstdout.layout=org.apache.log4j.PatternLayout -log4j.appender.cmdstdout.layout.ConversionPattern=%m%n -# -# Rolling File Appender -# -#log4j.appender.RFA=org.apache.log4j.RollingFileAppender -#log4j.appender.RFA.File=${hadoop.log.dir}/${hadoop.log.file} -# Logfile size and and 30-day backups -#log4j.appender.RFA.MaxFileSize=1MB -#log4j.appender.RFA.MaxBackupIndex=30 -#log4j.appender.RFA.layout=org.apache.log4j.PatternLayout -#log4j.appender.RFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} - %m%n -#log4j.appender.RFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n -# Custom Logging levels -log4j.logger.akka=WARN -log4j.logger.org.apache=WARN -log4j.logger.org.apache.sdap.mudrod=INFO diff --git a/service/pom.xml b/service/pom.xml index 8dd59a9..4859426 100644 --- a/service/pom.xml +++ b/service/pom.xml @@ -51,6 +51,13 @@ mudrod-web ${project.version} + + + + org.apache.sdap.mudrod + mudrod-ranking + ${project.version} + diff --git a/service/src/main/java/org/apache/sdap/mudrod/services/MudrodContextListener.java b/service/src/main/java/org/apache/sdap/mudrod/services/MudrodContextListener.java index 72130d3..cd86c7e 100644 --- a/service/src/main/java/org/apache/sdap/mudrod/services/MudrodContextListener.java +++ b/service/src/main/java/org/apache/sdap/mudrod/services/MudrodContextListener.java @@ -23,8 +23,8 @@ import org.apache.sdap.mudrod.main.MudrodEngine; import org.apache.sdap.mudrod.ontology.Ontology; import org.apache.sdap.mudrod.ontology.OntologyFactory; -import org.apache.sdap.mudrod.ssearch.Ranker; -import org.apache.sdap.mudrod.ssearch.Searcher; +import 
org.apache.sdap.mudrod.ranking.common.Ranker; +import org.apache.sdap.mudrod.ranking.common.Searcher; import java.util.Properties; diff --git a/service/src/main/java/org/apache/sdap/mudrod/services/search/SearchMetadataResource.java b/service/src/main/java/org/apache/sdap/mudrod/services/search/SearchMetadataResource.java index 9bdf455..0ef2d1c 100644 --- a/service/src/main/java/org/apache/sdap/mudrod/services/search/SearchMetadataResource.java +++ b/service/src/main/java/org/apache/sdap/mudrod/services/search/SearchMetadataResource.java @@ -19,8 +19,8 @@ import org.apache.sdap.mudrod.main.MudrodConstants; import org.apache.sdap.mudrod.main.MudrodEngine; -import org.apache.sdap.mudrod.ssearch.Ranker; -import org.apache.sdap.mudrod.ssearch.Searcher; +import org.apache.sdap.mudrod.ranking.common.Ranker; +import org.apache.sdap.mudrod.ranking.common.Searcher; import org.slf4j.Logger; import org.slf4j.LoggerFactory; From 58e15f92d1b8bdd4f3c6a6de2ac7c6796ec08d1a Mon Sep 17 00:00:00 2001 From: quintinali Date: Thu, 28 Jun 2018 16:10:37 -0700 Subject: [PATCH 03/13] remove ranking configuration from core module --- core/pom.xml | 26 ----------------------- core/src/main/resources/config.properties | 2 +- 2 files changed, 1 insertion(+), 27 deletions(-) diff --git a/core/pom.xml b/core/pom.xml index 2725e87..68c57c0 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -151,16 +151,10 @@ ${basedir}/src/main/resources true - - ${svmSgdModel.value}/** - ${project.build.directory} - - ${svmSgdModel.value}.zip - @@ -206,26 +200,6 @@ maven-assembly-plugin 2.6 - - zipSVMWithSGDModel - generate-resources - - single - - - false - posix - ${svmSgdModel.value} - ${project.build.directory} - - - - ${basedir}/src/main/assembly/zipSVMWithSGDModel.xml - - - - - generateDistribution package diff --git a/core/src/main/resources/config.properties b/core/src/main/resources/config.properties index 4c8991e..7147eac 100644 --- a/core/src/main/resources/config.properties +++ 
b/core/src/main/resources/config.properties @@ -61,7 +61,7 @@ mudrod.metadata.url = null mudrod.metadata.weight = 1 mudrod.metadata.type = RawMetadata -# ranking, ${svmSgdModel.value} is resolved at build time. See the property in core/pom.xml for the value +# ranking, ${svmSgdModel.value} is resolved at build time. See the property in ranking/pom.xml for the value mudrod.ranking.machine.learning = 1 mudrod.ranking.model = ${svmSgdModel.value}.zip From a44c08fce17cb5ef33ef93785aee3d3cc060ffa0 Mon Sep 17 00:00:00 2001 From: quintinali Date: Fri, 29 Jun 2018 09:44:46 -0700 Subject: [PATCH 04/13] add ignore configiration --- .gitignore | 1 + ranking/.gitignore | 1 + 2 files changed, 2 insertions(+) create mode 100644 ranking/.gitignore diff --git a/.gitignore b/.gitignore index 3009cbf..ccdad32 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,4 @@ core/.externalToolBuilders/Maven_Ant_Builder.launch core/maven-eclipse.xml service/.classpath web/.classpath +/bin/ diff --git a/ranking/.gitignore b/ranking/.gitignore new file mode 100644 index 0000000..b83d222 --- /dev/null +++ b/ranking/.gitignore @@ -0,0 +1 @@ +/target/ From ce881b10f00c5082d0cdb12550cf2d0abc96f76c Mon Sep 17 00:00:00 2001 From: quintinali Date: Fri, 29 Jun 2018 09:48:13 -0700 Subject: [PATCH 05/13] refactor ranking module --- .../sdap/mudrod/ranking/common/Learner.java | 31 +------ .../sdap/mudrod/ranking/dlrank/DLLearner.java | 83 +++++++++++++++++++ .../mudrod/ranking/dlrank/SparkFormatter.java | 55 ------------ .../sdap/mudrod/ranking/dlrank/SparkSVM.java | 49 ----------- .../ClickstreamImporter.java | 2 +- .../{common => evaluate}/Evaluator.java | 2 +- .../TrainingImporter.java | 2 +- .../mudrod/ranking/ranksvm/SVMLearner.java | 25 +++--- ...pertData.java => ExpertRankTrainData.java} | 12 ++- .../traindata/RankTrainDataFactory.java | 76 +++++++++++++++++ 10 files changed, 189 insertions(+), 148 deletions(-) create mode 100644 
ranking/src/main/java/org/apache/sdap/mudrod/ranking/dlrank/DLLearner.java delete mode 100644 ranking/src/main/java/org/apache/sdap/mudrod/ranking/dlrank/SparkFormatter.java delete mode 100644 ranking/src/main/java/org/apache/sdap/mudrod/ranking/dlrank/SparkSVM.java rename ranking/src/main/java/org/apache/sdap/mudrod/ranking/{traindata => evaluate}/ClickstreamImporter.java (98%) rename ranking/src/main/java/org/apache/sdap/mudrod/ranking/{common => evaluate}/Evaluator.java (98%) rename ranking/src/main/java/org/apache/sdap/mudrod/ranking/{traindata => evaluate}/TrainingImporter.java (98%) rename ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/{ExpertData.java => ExpertRankTrainData.java} (97%) create mode 100644 ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/RankTrainDataFactory.java diff --git a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/Learner.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/Learner.java index 315b8bd..1b2505d 100644 --- a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/Learner.java +++ b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/Learner.java @@ -16,8 +16,7 @@ import org.apache.sdap.mudrod.discoveryengine.MudrodAbstract; import org.apache.sdap.mudrod.driver.ESDriver; import org.apache.sdap.mudrod.driver.SparkDriver; -import org.apache.sdap.mudrod.ranking.traindata.ExpertData; -import org.apache.sdap.mudrod.ranking.traindata.TrainingImporter; +import org.apache.sdap.mudrod.ranking.traindata.ExpertRankTrainData; import org.apache.spark.SparkContext; import org.apache.spark.mllib.classification.SVMModel; import org.apache.spark.mllib.regression.LabeledPoint; @@ -37,32 +36,6 @@ public Learner(Properties props, ESDriver es, SparkDriver spark) { // TODO Auto-generated constructor stub } - /** - * Constructor to load in spark SVM classifier - * - * @param classifierName - * classifier type - * @param skd - * an instance of spark driver - * @param svmSgdModel 
- * path to a trained model - */ - - public String extractTrainDataFromExperts(String sourceDir){ - File sourceFile = new File(sourceDir); - boolean bDir = sourceFile.isDirectory(); - boolean multFiles = false; - if(bDir){ - multFiles = true; - } - - String resultDir = sourceFile.getParent() + "/trainsets.txt"; - ExpertData converter = new ExpertData(sourceDir, resultDir, true); - converter.convert2TrainSet(); - - return resultDir; - } - /** * Method of classifying instance * @@ -72,7 +45,7 @@ public String extractTrainDataFromExperts(String sourceDir){ */ //public abstract double classify(LabeledPoint p); - public abstract String prepareTrainData(String sourceDir); + public abstract String customizeTrainData(String sourceDir); public abstract void train(String trainFile); diff --git a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/dlrank/DLLearner.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/dlrank/DLLearner.java new file mode 100644 index 0000000..4561081 --- /dev/null +++ b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/dlrank/DLLearner.java @@ -0,0 +1,83 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.sdap.mudrod.ranking.dlrank; + +import java.io.File; +import java.util.Properties; + +import org.apache.sdap.mudrod.driver.ESDriver; +import org.apache.sdap.mudrod.driver.SparkDriver; +import org.apache.sdap.mudrod.main.MudrodEngine; +import org.apache.sdap.mudrod.ranking.common.Learner; +import org.apache.sdap.mudrod.ranking.common.LearnerFactory; +import org.apache.spark.SparkContext; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.mllib.classification.SVMModel; +import org.apache.spark.mllib.classification.SVMWithSGD; +import org.apache.spark.mllib.linalg.Vectors; +import org.apache.spark.mllib.regression.LabeledPoint; +import org.apache.spark.mllib.util.MLUtils; + +/** + * Supports the ability to importing classifier into memory + */ +public class DLLearner extends Learner { + /** + * + */ + private static final long serialVersionUID = 1L; + transient SparkContext sc = null; + + /** + * Constructor to load in spark SVM classifier + * + * @param classifierName + * classifier type + * @param skd + * an instance of spark driver + * @param svmSgdModel + * path to a trained model + */ + public DLLearner(Properties props, ESDriver es, SparkDriver spark, String dlModel) { + super(props, es, spark); + + sc = spark.sc.sc(); + load(dlModel); + } + + @Override + public void train(String trainFile) { + + } + + @Override + public double predict(double[] value) { + return 0; + } + + @Override + public void save() { + } + + @Override + public void load(String svmSgdModel) { + } + + @Override + public String customizeTrainData(String sourceDir) { + // TODO Auto-generated method stub + return null; + } +} diff --git a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/dlrank/SparkFormatter.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/dlrank/SparkFormatter.java deleted file mode 100644 index 356365b..0000000 --- 
a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/dlrank/SparkFormatter.java +++ /dev/null @@ -1,55 +0,0 @@ -package org.apache.sdap.mudrod.ranking.dlrank; - -import java.io.*; -import java.text.DecimalFormat; - -public class SparkFormatter { - DecimalFormat NDForm = new DecimalFormat("#.###"); - - public SparkFormatter() { - } - - public void toSparkSVMformat(String inputCSVFileName, String outputTXTFileName) { - File file = new File(outputTXTFileName); - if (file.exists()) { - file.delete(); - } - try { - file.createNewFile(); - FileWriter fw = new FileWriter(outputTXTFileName); - BufferedWriter bw = new BufferedWriter(fw); - - BufferedReader br = new BufferedReader(new FileReader(inputCSVFileName)); - br.readLine(); - String line = br.readLine(); - while (line != null) { - String[] list = line.split(","); - String output = ""; - Double label = Double.parseDouble(list[list.length - 1].replace("\"", "")); - if (label == -1.0) { - output = "0 "; - } else if (label == 1.0) { - output = "1 "; - } - - for (int i = 0; i < list.length - 1; i++) { - int index = i + 1; - output += index + ":" + NDForm.format(Double.parseDouble(list[i].replace("\"", ""))) + " "; - } - bw.write(output + "\n"); - - line = br.readLine(); - } - br.close(); - bw.close(); - } catch (IOException e) { - e.printStackTrace(); - } - } - - public static void main(String[] args) { - SparkFormatter sf = new SparkFormatter(); - sf.toSparkSVMformat("C:/mudrodCoreTestData/rankingResults/inputDataForSVM.csv", "C:/mudrodCoreTestData/rankingResults/inputDataForSVM_spark.txt"); - } - -} diff --git a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/dlrank/SparkSVM.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/dlrank/SparkSVM.java deleted file mode 100644 index 388c632..0000000 --- a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/dlrank/SparkSVM.java +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file 
except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.sdap.mudrod.ranking.dlrank; - -import org.apache.sdap.mudrod.main.MudrodEngine; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.mllib.classification.SVMModel; -import org.apache.spark.mllib.classification.SVMWithSGD; -import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.mllib.util.MLUtils; - -public class SparkSVM { - - private SparkSVM() { - //public constructor - } - - public static void main(String[] args) { - MudrodEngine me = new MudrodEngine(); - - JavaSparkContext jsc = me.startSparkDriver().sc; - - String path = SparkSVM.class.getClassLoader().getResource("inputDataForSVM_spark.txt").toString(); - JavaRDD data = MLUtils.loadLibSVMFile(jsc.sc(), path).toJavaRDD(); - - // Run training algorithm to build the model. 
- int numIterations = 100; - final SVMModel model = SVMWithSGD.train(data.rdd(), numIterations); - - // Save and load model - model.save(jsc.sc(), SparkSVM.class.getClassLoader().getResource("javaSVMWithSGDModel").toString()); - - jsc.sc().stop(); - - } - -} diff --git a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/ClickstreamImporter.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/evaluate/ClickstreamImporter.java similarity index 98% rename from ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/ClickstreamImporter.java rename to ranking/src/main/java/org/apache/sdap/mudrod/ranking/evaluate/ClickstreamImporter.java index 5b85b29..a75061f 100644 --- a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/ClickstreamImporter.java +++ b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/evaluate/ClickstreamImporter.java @@ -11,7 +11,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.sdap.mudrod.ranking.traindata; +package org.apache.sdap.mudrod.ranking.evaluate; import org.apache.sdap.mudrod.discoveryengine.MudrodAbstract; import org.apache.sdap.mudrod.driver.ESDriver; diff --git a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/Evaluator.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/evaluate/Evaluator.java similarity index 98% rename from ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/Evaluator.java rename to ranking/src/main/java/org/apache/sdap/mudrod/ranking/evaluate/Evaluator.java index c84c537..0c4e3fd 100644 --- a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/Evaluator.java +++ b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/evaluate/Evaluator.java @@ -11,7 +11,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.sdap.mudrod.ranking.common; +package org.apache.sdap.mudrod.ranking.evaluate; import java.util.Collections; import java.util.Comparator; diff --git a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/TrainingImporter.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/evaluate/TrainingImporter.java similarity index 98% rename from ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/TrainingImporter.java rename to ranking/src/main/java/org/apache/sdap/mudrod/ranking/evaluate/TrainingImporter.java index a1f1dff..5cae4a1 100644 --- a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/TrainingImporter.java +++ b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/evaluate/TrainingImporter.java @@ -11,7 +11,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.sdap.mudrod.ranking.traindata; +package org.apache.sdap.mudrod.ranking.evaluate; import org.apache.sdap.mudrod.discoveryengine.MudrodAbstract; import org.apache.sdap.mudrod.driver.ESDriver; diff --git a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/ranksvm/SVMLearner.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/ranksvm/SVMLearner.java index 875ef45..6b1939f 100644 --- a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/ranksvm/SVMLearner.java +++ b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/ranksvm/SVMLearner.java @@ -14,6 +14,11 @@ package org.apache.sdap.mudrod.ranking.ranksvm; import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Comparator; import java.util.Properties; import org.apache.sdap.mudrod.driver.ESDriver; @@ -21,6 +26,7 @@ import org.apache.sdap.mudrod.main.MudrodEngine; import org.apache.sdap.mudrod.ranking.common.Learner; import org.apache.sdap.mudrod.ranking.common.LearnerFactory; +import 
org.apache.sdap.mudrod.ranking.traindata.RankTrainDataFactory; import org.apache.spark.SparkContext; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; @@ -58,12 +64,13 @@ public SVMLearner(Properties props, ESDriver es, SparkDriver spark, String svmSg load(svmSgdModel); } - public String prepareTrainData(String sourceDir) { + public String customizeTrainData(String sourceDir) { //add other source, such as log, streaming - String resultFile = this.extractTrainDataFromExperts(sourceDir); + //String resultFile = this.extractTrainDataFromExperts(sourceDir); + RankTrainDataFactory factory = new RankTrainDataFactory(props, es, spark); + String resultFile = factory.createTrainData(sourceDir); - //String path = SVMLearner.class.getClassLoader().getResource("trainsets").toString(); String path = new File(resultFile).getParent(); String svmSparkFile = path + "/inputDataForSVM_spark.txt"; SparkFormatter sf = new SparkFormatter(); @@ -74,9 +81,7 @@ public String prepareTrainData(String sourceDir) { @Override public void train(String trainFile) { - //String path = SVMLearner.class.getClassLoader().getResource("inputDataForSVM_spark.txt").toString(); JavaRDD data = MLUtils.loadLibSVMFile(sc, trainFile).toJavaRDD(); - // Run training algorithm to build the model. 
int numIterations = 100; model = SVMWithSGD.train(data.rdd(), numIterations); @@ -90,14 +95,14 @@ public double predict(double[] value) { @Override public void save() { - // Save and load model + // Save model String modelPath = SVMLearner.class.getClassLoader().getResource("javaSVMWithSGDModel").toString(); model.save(sc, modelPath); } @Override public void load(String svmSgdModel) { - // TODO Auto-generated method stub + // load model sc.addFile(svmSgdModel, true); model = SVMModel.load(sc, svmSgdModel); } @@ -106,14 +111,14 @@ public static void main(String[] arg0) { MudrodEngine me = new MudrodEngine(); Properties props = me.loadConfig(); - SparkDriver spark = new SparkDriver(me.getConfig()); - ESDriver es = new ESDriver(me.getConfig()); + SparkDriver spark = new SparkDriver(props); + ESDriver es = new ESDriver(props); LearnerFactory factory = new LearnerFactory(props, es, spark); Learner le = factory.createLearner(); String sourceDir = "E://data//mudrod//ranking//rankingResults//training//training_data_v4"; - String trainFile = le.prepareTrainData(sourceDir); + String trainFile = le.customizeTrainData(sourceDir); le.train(trainFile); le.save(); } diff --git a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/ExpertData.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/ExpertRankTrainData.java similarity index 97% rename from ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/ExpertData.java rename to ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/ExpertRankTrainData.java index 98ba6bb..3766c2d 100644 --- a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/ExpertData.java +++ b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/ExpertRankTrainData.java @@ -28,7 +28,7 @@ * the space of query-document vectors, e.g. x1, x2, x3, we transform them into a new space * in which a pair of documents is represented as the difference between their feature vectors. 
*/ -public class ExpertData { +public class ExpertRankTrainData { private static String mySourceDir; private static String myResultDir; private static boolean isMultFiles; @@ -59,11 +59,19 @@ public class ExpertData { * @param multFiles true if multiple files in directory need to be processed and false if * only a single file needs to be processed */ - public ExpertData(String sourceDir, String resultDir, boolean multFiles) { + public ExpertRankTrainData(String sourceDir, String resultDir, boolean multFiles) { mySourceDir = sourceDir; myResultDir = resultDir; isMultFiles = multFiles; } + + /** + * prepare data for experts to evaluate. + */ + public void prepareDataForExperts() { + parseFile(); + writeCSVfile(myMasterList); + } /** * Responsible for invoking the processing of data file(s) and their subsequent storage diff --git a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/RankTrainDataFactory.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/RankTrainDataFactory.java new file mode 100644 index 0000000..b59ae49 --- /dev/null +++ b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/RankTrainDataFactory.java @@ -0,0 +1,76 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.sdap.mudrod.ranking.traindata; + +import org.apache.sdap.mudrod.discoveryengine.MudrodAbstract; +import org.apache.sdap.mudrod.driver.ESDriver; +import org.apache.sdap.mudrod.driver.SparkDriver; +import org.apache.sdap.mudrod.main.MudrodConstants; +import org.apache.sdap.mudrod.ranking.ranksvm.SVMLearner; +import org.apache.sdap.mudrod.weblog.structure.SessionExtractor; +import org.apache.spark.SparkContext; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.mllib.classification.SVMModel; +import org.apache.spark.mllib.regression.LabeledPoint; + +import java.io.File; +import java.io.Serializable; +import java.util.Properties; + +/** + * Supports the ability to importing classifier into memory + */ +public class RankTrainDataFactory extends MudrodAbstract{ + + public RankTrainDataFactory(Properties props, ESDriver es, SparkDriver spark) { + super(props, es, spark); + // TODO Auto-generated constructor stub + } + + /** + * Method of classifying instance + * + * @param p + * the instance that needs to be classified + * @return the class id + */ + + public String createTrainData(String sourceDir) { + + return createTrainDataFromExperts(sourceDir); + + } + + public String createTrainDataFromExperts(String sourceDir) { + File sourceFile = new File(sourceDir); + boolean bDir = sourceFile.isDirectory(); + boolean multFiles = false; + if (bDir) { + multFiles = true; + } + + String resultDir = sourceFile.getParent() + "/trainsets.txt"; + ExpertRankTrainData converter = new ExpertRankTrainData(sourceDir, resultDir, true); + converter.convert2TrainSet(); + + return resultDir; + } + + // start session + // mode: overwrite or append + public String createTrainDataFromOfflineLogs(String trainsetFile, int start, int mode) { + + return ""; + } +} From eb711c7501e764898ebd3f10a0e1e92e8f15a159 Mon Sep 17 00:00:00 2001 From: quintinali Date: Fri, 29 Jun 2018 11:53:00 -0700 Subject: [PATCH 06/13] modify pom.xml --- core/pom.xml | 2 +- 
ranking/.classpath | 37 ++++++++++++++ ranking/.gitignore | 2 + ranking/pom.xml | 2 +- ranking/src/main/assembly/bin.xml | 50 +++++++++++++++++++ .../traindata/RankTrainDataFactory.java | 1 - 6 files changed, 91 insertions(+), 3 deletions(-) create mode 100644 ranking/.classpath create mode 100644 ranking/src/main/assembly/bin.xml diff --git a/core/pom.xml b/core/pom.xml index 68c57c0..baa9bb3 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -18,7 +18,7 @@ 4.0.0 - org.apache.sdap.mudrod + org.apache.sdap mudrod-parent 0.0.1-SNAPSHOT ../ diff --git a/ranking/.classpath b/ranking/.classpath new file mode 100644 index 0000000..c2fd87a --- /dev/null +++ b/ranking/.classpath @@ -0,0 +1,37 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ranking/.gitignore b/ranking/.gitignore index b83d222..ca94514 100644 --- a/ranking/.gitignore +++ b/ranking/.gitignore @@ -1 +1,3 @@ /target/ +/bin/ +/lib/ diff --git a/ranking/pom.xml b/ranking/pom.xml index 490738b..178357d 100644 --- a/ranking/pom.xml +++ b/ranking/pom.xml @@ -11,7 +11,7 @@ 4.0.0 - org.apache.sdap.mudrod + org.apache.sdap mudrod-parent 0.0.1-SNAPSHOT ../ diff --git a/ranking/src/main/assembly/bin.xml b/ranking/src/main/assembly/bin.xml new file mode 100644 index 0000000..7977d65 --- /dev/null +++ b/ranking/src/main/assembly/bin.xml @@ -0,0 +1,50 @@ + + + + bin + + tar.gz + zip + + + + target/appassembler/bin + bin + + *.bat + + unix + 0755 + 0644 + + + target/appassembler/bin + bin + + *.bat + + dos + 0755 + 0644 + + + target/appassembler/lib + lib + + + \ No newline at end of file diff --git a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/RankTrainDataFactory.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/RankTrainDataFactory.java index b59ae49..eb735ad 100644 --- a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/RankTrainDataFactory.java +++ 
b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/RankTrainDataFactory.java @@ -18,7 +18,6 @@ import org.apache.sdap.mudrod.driver.SparkDriver; import org.apache.sdap.mudrod.main.MudrodConstants; import org.apache.sdap.mudrod.ranking.ranksvm.SVMLearner; -import org.apache.sdap.mudrod.weblog.structure.SessionExtractor; import org.apache.spark.SparkContext; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.mllib.classification.SVMModel; From ae6b2c31f9674900af04a1b4dc59e4ea503a88f3 Mon Sep 17 00:00:00 2001 From: quintinali Date: Fri, 29 Jun 2018 13:14:37 -0700 Subject: [PATCH 07/13] delete useless added files --- .gitignore | 3 +- core/lib/all-1.1.2.pom | 102 ---------------------------- core/lib/apache-jena-libs-3.1.0.pom | 51 -------------- ranking/.classpath | 37 ---------- 4 files changed, 2 insertions(+), 191 deletions(-) delete mode 100644 core/lib/all-1.1.2.pom delete mode 100644 core/lib/apache-jena-libs-3.1.0.pom delete mode 100644 ranking/.classpath diff --git a/.gitignore b/.gitignore index ccdad32..10c0fda 100644 --- a/.gitignore +++ b/.gitignore @@ -15,4 +15,5 @@ core/.externalToolBuilders/Maven_Ant_Builder.launch core/maven-eclipse.xml service/.classpath web/.classpath -/bin/ +-web/.externalToolBuilders/ +-web/maven-eclipse.xml diff --git a/core/lib/all-1.1.2.pom b/core/lib/all-1.1.2.pom deleted file mode 100644 index 577dd32..0000000 --- a/core/lib/all-1.1.2.pom +++ /dev/null @@ -1,102 +0,0 @@ - - - 4.0.0 - - - com.github.fommil.netlib - parent - 1.1 - - - all - pom - 1.1.2 - - - - - net.sourceforge.f2j - arpack_combined_all - 0.1 - - - ${project.parent.groupId} - core - 1.1.2 - - - ${project.parent.groupId} - netlib-native_ref-osx-x86_64 - ${project.parent.version} - natives - - - ${project.parent.groupId} - netlib-native_ref-linux-x86_64 - ${project.parent.version} - natives - - - ${project.parent.groupId} - netlib-native_ref-linux-i686 - ${project.parent.version} - natives - - - ${project.parent.groupId} - 
netlib-native_ref-win-x86_64 - ${project.parent.version} - natives - - - ${project.parent.groupId} - netlib-native_ref-win-i686 - ${project.parent.version} - natives - - - ${project.parent.groupId} - netlib-native_ref-linux-armhf - ${project.parent.version} - natives - - - ${project.parent.groupId} - netlib-native_system-osx-x86_64 - ${project.parent.version} - natives - - - ${project.parent.groupId} - netlib-native_system-linux-x86_64 - ${project.parent.version} - natives - - - ${project.parent.groupId} - netlib-native_system-linux-i686 - ${project.parent.version} - natives - - - ${project.parent.groupId} - netlib-native_system-linux-armhf - ${project.parent.version} - natives - - - ${project.parent.groupId} - netlib-native_system-win-x86_64 - ${project.parent.version} - natives - - - ${project.parent.groupId} - netlib-native_system-win-i686 - ${project.parent.version} - natives - - - diff --git a/core/lib/apache-jena-libs-3.1.0.pom b/core/lib/apache-jena-libs-3.1.0.pom deleted file mode 100644 index 3fdb874..0000000 --- a/core/lib/apache-jena-libs-3.1.0.pom +++ /dev/null @@ -1,51 +0,0 @@ - - - - - 4.0.0 - apache-jena-libs - pom - Apache Jena - Libraries POM - 3.1.0 - - - org.apache.jena - jena-parent - 16 - ../jena-parent - - - - - - apache.snapshots - Apache Snapshot Repository - http://repository.apache.org/snapshots - - false - - - - - A convenience POM artifact that may be referenced to pull in all the standard Jena Libraries (Core, ARQ, IRI, and TDB) with a single dependency. 
- - - - - org.apache.jena - jena-tdb - 3.1.0 - - - - - diff --git a/ranking/.classpath b/ranking/.classpath deleted file mode 100644 index c2fd87a..0000000 --- a/ranking/.classpath +++ /dev/null @@ -1,37 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - From 050fbe866f7f15aef278faea123c648094da6d10 Mon Sep 17 00:00:00 2001 From: quintinali Date: Fri, 29 Jun 2018 13:15:11 -0700 Subject: [PATCH 08/13] delete useless added files --- vim.exe.stackdump | 5 ----- 1 file changed, 5 deletions(-) delete mode 100644 vim.exe.stackdump diff --git a/vim.exe.stackdump b/vim.exe.stackdump deleted file mode 100644 index a4373c2..0000000 --- a/vim.exe.stackdump +++ /dev/null @@ -1,5 +0,0 @@ -Stack trace: -Frame Function Args -2003FDA8 6101E3DA (0000032C, 0000EA60, 000000A4, 2003FE18) -2003FED8 610E5166 (00000001, 00000000, 0000000D, 0056338F) -2004003C 610E1C2B (00000000, 00000000, 00000000, 00000000) From 02abc70cc05d8ecf8af64ec1af10cce088ef5ea8 Mon Sep 17 00:00:00 2001 From: quintinali Date: Fri, 29 Jun 2018 13:17:20 -0700 Subject: [PATCH 09/13] fixed bug --- .gitignore | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 10c0fda..a8ff594 100644 --- a/.gitignore +++ b/.gitignore @@ -15,5 +15,5 @@ core/.externalToolBuilders/Maven_Ant_Builder.launch core/maven-eclipse.xml service/.classpath web/.classpath --web/.externalToolBuilders/ --web/maven-eclipse.xml +web/.externalToolBuilders/ +web/maven-eclipse.xml From 56ae474190bb9de7034d69199780c05eff84efc1 Mon Sep 17 00:00:00 2001 From: quintinali Date: Tue, 3 Jul 2018 08:22:01 -0700 Subject: [PATCH 10/13] fixed bugs and modify code as request --- ranking/pom.xml | 19 +- ranking/src/main/assembly/bin.xml | 50 ------ .../sdap/mudrod/ranking/common/Learner.java | 14 +- .../mudrod/ranking/common/LearnerFactory.java | 27 +-- .../sdap/mudrod/ranking/dlrank/DLLearner.java | 65 +++---- .../mudrod/ranking/ranksvm/SVMLearner.java | 165 +++++++++--------- 
.../ranking/ranksvm/SparkFormatter.java | 26 ++- .../traindata/ExpertRankTrainData.java | 2 +- .../traindata/RankTrainDataFactory.java | 16 +- service/pom.xml | 2 +- 10 files changed, 140 insertions(+), 246 deletions(-) delete mode 100644 ranking/src/main/assembly/bin.xml diff --git a/ranking/pom.xml b/ranking/pom.xml index 178357d..591fda2 100644 --- a/ranking/pom.xml +++ b/ranking/pom.xml @@ -31,7 +31,7 @@ - org.apache.sdap.mudrod + org.apache.sdap mudrod-core ${project.version} @@ -116,23 +116,6 @@ - - - generateDistribution - package - - single - - - false - posix - - - ${basedir}/src/main/assembly/bin.xml - - - - diff --git a/ranking/src/main/assembly/bin.xml b/ranking/src/main/assembly/bin.xml deleted file mode 100644 index 7977d65..0000000 --- a/ranking/src/main/assembly/bin.xml +++ /dev/null @@ -1,50 +0,0 @@ - - - - bin - - tar.gz - zip - - - - target/appassembler/bin - bin - - *.bat - - unix - 0755 - 0644 - - - target/appassembler/bin - bin - - *.bat - - dos - 0755 - 0644 - - - target/appassembler/lib - lib - - - \ No newline at end of file diff --git a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/Learner.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/Learner.java index 1b2505d..c2d78d8 100644 --- a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/Learner.java +++ b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/Learner.java @@ -27,24 +27,14 @@ import java.util.Properties; /** - * Supports the ability to importing classifier into memory + * learn rank weights from train data and predict search results ranking */ public abstract class Learner extends MudrodAbstract { - public Learner(Properties props, ESDriver es, SparkDriver spark) { + public Learner(Properties props, ESDriver es, SparkDriver spark) { super(props, es, spark); - // TODO Auto-generated constructor stub } - /** - * Method of classifying instance - * - * @param p - * the instance that needs to be classified - * @return the class id - 
*/ - //public abstract double classify(LabeledPoint p); - public abstract String customizeTrainData(String sourceDir); public abstract void train(String trainFile); diff --git a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/LearnerFactory.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/LearnerFactory.java index 1aa0fdf..4da1b49 100644 --- a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/LearnerFactory.java +++ b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/LearnerFactory.java @@ -26,29 +26,18 @@ import java.util.Properties; /** - * Supports the ability to importing classifier into memory + * Create a learner due to configuration */ public class LearnerFactory extends MudrodAbstract { public LearnerFactory(Properties props, ESDriver es, SparkDriver spark) { - super(props, es, spark); - // TODO Auto-generated constructor stub - } - -/** - * Method of classifying instance - * - * @param p the instance that needs to be classified - * @return the class id - */ - - public Learner createLearner(){ - if("1".equals(props.getProperty(MudrodConstants.RANKING_ML))) - return new SVMLearner(props, es, spark, props.getProperty(MudrodConstants.RANKING_MODEL)); - - return null; - + super(props, es, spark); } - + public Learner createLearner() { + if ("1".equals(props.getProperty(MudrodConstants.RANKING_ML))) + return new SVMLearner(props, es, spark, props.getProperty(MudrodConstants.RANKING_MODEL)); + + return null; + } } diff --git a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/dlrank/DLLearner.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/dlrank/DLLearner.java index 4561081..b5fd274 100644 --- a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/dlrank/DLLearner.java +++ b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/dlrank/DLLearner.java @@ -30,54 +30,37 @@ import org.apache.spark.mllib.regression.LabeledPoint; import org.apache.spark.mllib.util.MLUtils; -/** - * Supports the ability to 
importing classifier into memory - */ public class DLLearner extends Learner { - /** - * - */ - private static final long serialVersionUID = 1L; - transient SparkContext sc = null; + private static final long serialVersionUID = 1L; + transient SparkContext sc = null; - /** - * Constructor to load in spark SVM classifier - * - * @param classifierName - * classifier type - * @param skd - * an instance of spark driver - * @param svmSgdModel - * path to a trained model - */ - public DLLearner(Properties props, ESDriver es, SparkDriver spark, String dlModel) { - super(props, es, spark); + public DLLearner(Properties props, ESDriver es, SparkDriver spark, String dlModel) { + super(props, es, spark); - sc = spark.sc.sc(); - load(dlModel); - } + sc = spark.sc.sc(); + load(dlModel); + } - @Override - public void train(String trainFile) { + @Override + public void train(String trainFile) { - } + } - @Override - public double predict(double[] value) { - return 0; - } + @Override + public double predict(double[] value) { + return 0; + } - @Override - public void save() { - } + @Override + public void save() { + } - @Override - public void load(String svmSgdModel) { - } + @Override + public void load(String svmSgdModel) { + } - @Override - public String customizeTrainData(String sourceDir) { - // TODO Auto-generated method stub - return null; - } + @Override + public String customizeTrainData(String sourceDir) { + return null; + } } diff --git a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/ranksvm/SVMLearner.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/ranksvm/SVMLearner.java index 6b1939f..36eb8d2 100644 --- a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/ranksvm/SVMLearner.java +++ b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/ranksvm/SVMLearner.java @@ -37,89 +37,88 @@ import org.apache.spark.mllib.util.MLUtils; /** - * Supports the ability to importing classifier into memory + * Learn ranking weights with SVM model */ public class 
SVMLearner extends Learner { - /** - * - */ - private static final long serialVersionUID = 1L; - SVMModel model = null; - transient SparkContext sc = null; - - /** - * Constructor to load in spark SVM classifier - * - * @param classifierName - * classifier type - * @param skd - * an instance of spark driver - * @param svmSgdModel - * path to a trained model - */ - public SVMLearner(Properties props, ESDriver es, SparkDriver spark, String svmSgdModel) { - super(props, es, spark); - - sc = spark.sc.sc(); - load(svmSgdModel); - } - - public String customizeTrainData(String sourceDir) { - - //add other source, such as log, streaming - //String resultFile = this.extractTrainDataFromExperts(sourceDir); - RankTrainDataFactory factory = new RankTrainDataFactory(props, es, spark); - String resultFile = factory.createTrainData(sourceDir); - - String path = new File(resultFile).getParent(); - String svmSparkFile = path + "/inputDataForSVM_spark.txt"; - SparkFormatter sf = new SparkFormatter(); - sf.toSparkSVMformat(resultFile,svmSparkFile); - - return svmSparkFile; - } - - @Override - public void train(String trainFile) { - JavaRDD data = MLUtils.loadLibSVMFile(sc, trainFile).toJavaRDD(); - // Run training algorithm to build the model. 
- int numIterations = 100; - model = SVMWithSGD.train(data.rdd(), numIterations); - } - - @Override - public double predict(double[] value) { - LabeledPoint p = new LabeledPoint(99.0, Vectors.dense(value)); - return model.predict(p.features()); - } - - @Override - public void save() { - // Save model - String modelPath = SVMLearner.class.getClassLoader().getResource("javaSVMWithSGDModel").toString(); - model.save(sc, modelPath); - } - - @Override - public void load(String svmSgdModel) { - // load model - sc.addFile(svmSgdModel, true); - model = SVMModel.load(sc, svmSgdModel); - } - - public static void main(String[] arg0) { - MudrodEngine me = new MudrodEngine(); - Properties props = me.loadConfig(); - - SparkDriver spark = new SparkDriver(props); - ESDriver es = new ESDriver(props); - - LearnerFactory factory = new LearnerFactory(props, es, spark); - Learner le = factory.createLearner(); - - String sourceDir = "E://data//mudrod//ranking//rankingResults//training//training_data_v4"; - String trainFile = le.customizeTrainData(sourceDir); - le.train(trainFile); - le.save(); - } + /** + * + */ + private static final long serialVersionUID = 1L; + SVMModel model = null; + transient SparkContext sc = null; + + /** + * Constructor to load in spark SVM classifier + * + * @param classifierName + * classifier type + * @param skd + * an instance of spark driver + * @param svmSgdModel + * path to a trained model + */ + public SVMLearner(Properties props, ESDriver es, SparkDriver spark, String svmSgdModel) { + super(props, es, spark); + + sc = spark.sc.sc(); + load(svmSgdModel); + } + + public String customizeTrainData(String sourceDir) { + RankTrainDataFactory factory = new RankTrainDataFactory(props, es, spark); + String resultFile = factory.createTrainData(sourceDir); + + String path = new File(resultFile).getParent(); + + String separator = System.getProperty("file.separator"); + String svmSparkFile = path + separator + "inputDataForSVM_spark.txt"; + SparkFormatter sf = new 
SparkFormatter(); + sf.toSparkSVMformat(resultFile, svmSparkFile); + + return svmSparkFile; + } + + @Override + public void train(String trainFile) { + JavaRDD data = MLUtils.loadLibSVMFile(sc, trainFile).toJavaRDD(); + // Run training algorithm to build the model. + int numIterations = 100; + model = SVMWithSGD.train(data.rdd(), numIterations); + } + + @Override + public double predict(double[] value) { + LabeledPoint p = new LabeledPoint(99.0, Vectors.dense(value)); + return model.predict(p.features()); + } + + @Override + public void save() { + // Save model + String modelPath = SVMLearner.class.getClassLoader().getResource("javaSVMWithSGDModel").toString(); + model.save(sc, modelPath); + } + + @Override + public void load(String svmSgdModel) { + // load model + sc.addFile(svmSgdModel, true); + model = SVMModel.load(sc, svmSgdModel); + } + + public static void main(String[] arg0) { + MudrodEngine me = new MudrodEngine(); + Properties props = me.loadConfig(); + + SparkDriver spark = new SparkDriver(props); + ESDriver es = new ESDriver(props); + + LearnerFactory factory = new LearnerFactory(props, es, spark); + Learner le = factory.createLearner(); + + String sourceDir = "E://data//mudrod//ranking//rankingResults//training//training_data_v4"; + String trainFile = le.customizeTrainData(sourceDir); + le.train(trainFile); + //le.save(); + } } diff --git a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/ranksvm/SparkFormatter.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/ranksvm/SparkFormatter.java index 8fe2acd..413ab66 100644 --- a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/ranksvm/SparkFormatter.java +++ b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/ranksvm/SparkFormatter.java @@ -1,6 +1,24 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.sdap.mudrod.ranking.ranksvm; -import java.io.*; +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileReader; +import java.io.FileWriter; +import java.io.IOException; import java.text.DecimalFormat; public class SparkFormatter { @@ -46,10 +64,4 @@ public void toSparkSVMformat(String inputCSVFileName, String outputTXTFileName) e.printStackTrace(); } } - - public static void main(String[] args) { - SparkFormatter sf = new SparkFormatter(); - sf.toSparkSVMformat("C:/mudrodCoreTestData/rankingResults/inputDataForSVM.csv", "C:/mudrodCoreTestData/rankingResults/inputDataForSVM_spark.txt"); - } - } diff --git a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/ExpertRankTrainData.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/ExpertRankTrainData.java index 3766c2d..d0f8102 100644 --- a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/ExpertRankTrainData.java +++ b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/ExpertRankTrainData.java @@ -77,7 +77,7 @@ public void prepareDataForExperts() { * Responsible for invoking the processing of data file(s) and their subsequent storage * into a user specified directory. 
*/ - public void convert2TrainSet() { + public void convertToTrainSet() { parseFile(); writeCSVfile(myMasterList); } diff --git a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/RankTrainDataFactory.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/RankTrainDataFactory.java index eb735ad..6594d20 100644 --- a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/RankTrainDataFactory.java +++ b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/RankTrainDataFactory.java @@ -28,23 +28,14 @@ import java.util.Properties; /** - * Supports the ability to importing classifier into memory + * Create train data from difference sources, including experts provided data, offline logs and realtime logs */ public class RankTrainDataFactory extends MudrodAbstract{ public RankTrainDataFactory(Properties props, ESDriver es, SparkDriver spark) { super(props, es, spark); - // TODO Auto-generated constructor stub } - /** - * Method of classifying instance - * - * @param p - * the instance that needs to be classified - * @return the class id - */ - public String createTrainData(String sourceDir) { return createTrainDataFromExperts(sourceDir); @@ -61,15 +52,12 @@ public String createTrainDataFromExperts(String sourceDir) { String resultDir = sourceFile.getParent() + "/trainsets.txt"; ExpertRankTrainData converter = new ExpertRankTrainData(sourceDir, resultDir, true); - converter.convert2TrainSet(); + converter.convertToTrainSet(); return resultDir; } - // start session - // mode: overwrite or append public String createTrainDataFromOfflineLogs(String trainsetFile, int start, int mode) { - return ""; } } diff --git a/service/pom.xml b/service/pom.xml index a61b68f..6e364be 100644 --- a/service/pom.xml +++ b/service/pom.xml @@ -54,7 +54,7 @@ - org.apache.sdap.mudrod + org.apache.sdap mudrod-ranking ${project.version} From 3d8debef9dbeaf355fa377bbad7118da0c77b5f0 Mon Sep 17 00:00:00 2001 From: quintinali Date: Tue, 3 Jul 2018 08:22:24 
-0700 Subject: [PATCH 11/13] fixed bugs and modify code as request --- .../mudrod/ranking/evaluate/ClickstreamImporter.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/evaluate/ClickstreamImporter.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/evaluate/ClickstreamImporter.java index a75061f..0620d98 100644 --- a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/evaluate/ClickstreamImporter.java +++ b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/evaluate/ClickstreamImporter.java @@ -32,13 +32,13 @@ * Supports ability to import click stream data into Elasticsearch * through .csv file */ -public class ClickstreamImporter extends MudrodAbstract { +public class ClickStreamImporter extends MudrodAbstract { /** * */ private static final long serialVersionUID = 1L; - public ClickstreamImporter(Properties props, ESDriver es, SparkDriver spark) { + public ClickStreamImporter(Properties props, ESDriver es, SparkDriver spark) { super(props, es, spark); addClickStreamMapping(); } @@ -47,9 +47,9 @@ public ClickstreamImporter(Properties props, ESDriver es, SparkDriver spark) { * Method to add Elasticsearch mapping for click stream data */ public void addClickStreamMapping() { - XContentBuilder Mapping; + XContentBuilder mapping; try { - Mapping = jsonBuilder().startObject().startObject( + mapping = jsonBuilder().startObject().startObject( props.getProperty(MudrodConstants.CLICK_STREAM_MATRIX_TYPE)).startObject( "properties").startObject("query").field("type", "string").field( "index", "not_analyzed").endObject().startObject("dataID").field( @@ -60,7 +60,7 @@ public void addClickStreamMapping() { es.getClient().admin().indices().preparePutMapping( props.getProperty(MudrodConstants.ES_INDEX_NAME)).setType( props.getProperty(MudrodConstants.CLICK_STREAM_MATRIX_TYPE)).setSource( - Mapping).execute().actionGet(); + mapping).execute().actionGet(); } catch (IOException e) { 
e.printStackTrace(); } From 3bb1f1b2ca9e160c64fbedbf36d017d862c884a7 Mon Sep 17 00:00:00 2001 From: quintinali Date: Tue, 3 Jul 2018 09:35:19 -0700 Subject: [PATCH 12/13] delete hard code and create filewritter with try-catch-resource --- .../apache/sdap/mudrod/ranking/ranksvm/SVMLearner.java | 4 ++-- .../sdap/mudrod/ranking/ranksvm/SparkFormatter.java | 9 +++------ 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/ranksvm/SVMLearner.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/ranksvm/SVMLearner.java index 36eb8d2..2687482 100644 --- a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/ranksvm/SVMLearner.java +++ b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/ranksvm/SVMLearner.java @@ -116,9 +116,9 @@ public static void main(String[] arg0) { LearnerFactory factory = new LearnerFactory(props, es, spark); Learner le = factory.createLearner(); - String sourceDir = "E://data//mudrod//ranking//rankingResults//training//training_data_v4"; + String sourceDir = arg0[0]; String trainFile = le.customizeTrainData(sourceDir); le.train(trainFile); - //le.save(); + le.save(); } } diff --git a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/ranksvm/SparkFormatter.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/ranksvm/SparkFormatter.java index 413ab66..2562a0f 100644 --- a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/ranksvm/SparkFormatter.java +++ b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/ranksvm/SparkFormatter.java @@ -32,12 +32,10 @@ public void toSparkSVMformat(String inputCSVFileName, String outputTXTFileName) if (file.exists()) { file.delete(); } - try { + try (FileWriter fw = new FileWriter(outputTXTFileName); BufferedWriter bw = new BufferedWriter(fw); BufferedReader br = new BufferedReader(new FileReader(inputCSVFileName));) { + file.createNewFile(); - FileWriter fw = new FileWriter(outputTXTFileName); - BufferedWriter bw = new 
BufferedWriter(fw); - BufferedReader br = new BufferedReader(new FileReader(inputCSVFileName)); br.readLine(); String line = br.readLine(); while (line != null) { @@ -58,8 +56,7 @@ public void toSparkSVMformat(String inputCSVFileName, String outputTXTFileName) line = br.readLine(); } - br.close(); - bw.close(); + } catch (IOException e) { e.printStackTrace(); } From 023a971081e2521bd3e238e861b915ab3d28a313 Mon Sep 17 00:00:00 2001 From: quintinali Date: Wed, 1 Aug 2018 10:22:08 -0700 Subject: [PATCH 13/13] improve deep learning-based ranking --- .../weblog/pre/RankingTrainDataGenerator.java | 54 ----- .../weblog/structure/session/Session.java | 29 +-- .../structure/session/SessionExtractor.java | 100 +-------- .../weblog/structure/session/SessionTree.java | 82 +------ ranking/pom.xml | 17 ++ .../mudrod/ranking/common/LearnerFactory.java | 11 +- .../common/{Learner.java => RankLearner.java} | 8 +- .../sdap/mudrod/ranking/common/Ranker.java | 2 +- .../sdap/mudrod/ranking/dlrank/DLLearner.java | 66 ------ .../mudrod/ranking/dlrank/DLRankLearner.java | 212 ++++++++++++++++++ .../mudrod/ranking/dlrank/ND4JFormatter.java | 64 ++++++ .../{SVMLearner.java => RankSVMLearner.java} | 58 ++++- .../sdap/mudrod/ranking/ranksvm/SparkSVM.java | 47 ---- .../traindata/ExpertRankTrainData.java | 2 +- .../ranking/traindata/LogRankTrainData.java | 8 +- .../traindata/LogRankTrainDataExtractor.java | 159 +++++++++++++ .../traindata/RankTrainDataFactory.java | 66 +++++- 17 files changed, 584 insertions(+), 401 deletions(-) delete mode 100644 core/src/main/java/org/apache/sdap/mudrod/weblog/pre/RankingTrainDataGenerator.java rename ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/{Learner.java => RankLearner.java} (84%) delete mode 100644 ranking/src/main/java/org/apache/sdap/mudrod/ranking/dlrank/DLLearner.java create mode 100644 ranking/src/main/java/org/apache/sdap/mudrod/ranking/dlrank/DLRankLearner.java create mode 100644 
ranking/src/main/java/org/apache/sdap/mudrod/ranking/dlrank/ND4JFormatter.java rename ranking/src/main/java/org/apache/sdap/mudrod/ranking/ranksvm/{SVMLearner.java => RankSVMLearner.java} (62%) delete mode 100644 ranking/src/main/java/org/apache/sdap/mudrod/ranking/ranksvm/SparkSVM.java rename core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/RankingTrainData.java => ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/LogRankTrainData.java (94%) create mode 100644 ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/LogRankTrainDataExtractor.java diff --git a/core/src/main/java/org/apache/sdap/mudrod/weblog/pre/RankingTrainDataGenerator.java b/core/src/main/java/org/apache/sdap/mudrod/weblog/pre/RankingTrainDataGenerator.java deleted file mode 100644 index de41d56..0000000 --- a/core/src/main/java/org/apache/sdap/mudrod/weblog/pre/RankingTrainDataGenerator.java +++ /dev/null @@ -1,54 +0,0 @@ -package org.apache.sdap.mudrod.weblog.pre; - -import org.apache.sdap.mudrod.discoveryengine.DiscoveryStepAbstract; -import org.apache.sdap.mudrod.driver.ESDriver; -import org.apache.sdap.mudrod.driver.SparkDriver; -import org.apache.sdap.mudrod.weblog.structure.session.RankingTrainData; -import org.apache.sdap.mudrod.weblog.structure.session.SessionExtractor; -import org.apache.spark.api.java.JavaRDD; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.Properties; - -public class RankingTrainDataGenerator extends DiscoveryStepAbstract { - - private static final long serialVersionUID = 1L; - private static final Logger LOG = LoggerFactory.getLogger(RankingTrainDataGenerator.class); - - public RankingTrainDataGenerator(Properties props, ESDriver es, SparkDriver spark) { - super(props, es, spark); - // TODO Auto-generated constructor stub - } - - @Override - public Object execute() { - // TODO Auto-generated method stub - LOG.info("Starting generate ranking train data."); - startTime = System.currentTimeMillis(); - 
- String rankingTrainFile = "E:\\Mudrod_input_data\\Testing_Data_4_1monthLog+Meta+Onto\\traing.txt"; - try { - SessionExtractor extractor = new SessionExtractor(); - JavaRDD rankingTrainDataRDD = extractor.extractRankingTrainData(this.props, this.es, this.spark); - - JavaRDD rankingTrainData_JsonRDD = rankingTrainDataRDD.map(f -> f.toJson()); - - rankingTrainData_JsonRDD.coalesce(1, true).saveAsTextFile(rankingTrainFile); - - } catch (Exception e) { - e.printStackTrace(); - } - - endTime = System.currentTimeMillis(); - LOG.info("Ranking train data generation complete. Time elapsed {} seconds.", (endTime - startTime) / 1000); - return null; - } - - @Override - public Object execute(Object o) { - // TODO Auto-generated method stub - return null; - } - -} diff --git a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/Session.java b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/Session.java index 2c917a6..7a2ef8b 100644 --- a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/Session.java +++ b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/Session.java @@ -203,7 +203,7 @@ public List getClickStreamList(String indexName, String type, Strin * @return an instance of session tree structure * @throws UnsupportedEncodingException UnsupportedEncodingException */ - private SessionTree getSessionTree(String indexName, String type, String sessionID) throws UnsupportedEncodingException { + public SessionTree getSessionTree(String indexName, String type, String sessionID) throws UnsupportedEncodingException { SearchResponse response = es.getClient().prepareSearch(indexName).setTypes(type).setQuery(QueryBuilders.termQuery("SessionID", sessionID)).setSize(100).addSort("Time", SortOrder.ASC) .execute().actionGet(); @@ -261,31 +261,4 @@ private JsonElement getRequests(String cleanuptype, String sessionID) throws Uns } return gson.toJsonTree(requestList); } - - /** - * getClickStreamList: Extracted ranking 
training data from current session. - * - * @param indexName an index from which to obtain ranked training data. - * @param cleanuptype: Session type name in Elasticsearch - * @param sessionID: Session ID - * @return Click stram data list - * {@link ClickStream} - */ - public List getRankingTrainData(String indexName, String cleanuptype, String sessionID) { - SessionTree tree = null; - try { - tree = this.getSessionTree(indexName, cleanuptype, sessionID); - } catch (UnsupportedEncodingException e) { - LOG.error("Error whilst retreiving Session Tree: {}", e); - } - - List trainData = new ArrayList<>(); - try { - trainData = tree.getRankingTrainData(indexName); - } catch (UnsupportedEncodingException e) { - LOG.error("Error whilst retreiving ranking training data: {}", e); - } - - return trainData; - } } diff --git a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/SessionExtractor.java b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/SessionExtractor.java index f7eb602..8b6dc9d 100644 --- a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/SessionExtractor.java +++ b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/SessionExtractor.java @@ -205,7 +205,7 @@ public List call(List v1, List v2) throws Exception { * a log index name * @return list of session names */ - protected List getSessions(Properties props, ESDriver es, String logIndex) { + public List getSessions(Properties props, ESDriver es, String logIndex) { String cleanupType = MudrodConstants.CLEANUP_TYPE; String sessionStatType = MudrodConstants.SESSION_STATS_TYPE; @@ -383,102 +383,4 @@ public Tuple2> call(String sessionitem) throws Exception { } }); } - - /** - * extractClickStreamFromES:Extract click streams from logs stored in - * Elasticsearch - * - * @param props - * the Mudrod configuration - * @param es - * the Elasticsearch drive - * @param spark - * the spark driver - * @return clickstream list in JavaRDD format {@link 
ClickStream} - */ - public JavaRDD extractRankingTrainData(Properties props, ESDriver es, SparkDriver spark) { - - List queryList = this.extractRankingTrainData(props, es); - return spark.sc.parallelize(queryList); - - } - - /** - * getClickStreamList:Extract click streams from logs stored in Elasticsearch. - * - * @param props - * the Mudrod configuration - * @param es - * the Elasticsearch driver - * @return clickstream list {@link ClickStream} - */ - protected List extractRankingTrainData(Properties props, ESDriver es) { - List logIndexList = es.getIndexListWithPrefix(props.getProperty(MudrodConstants.LOG_INDEX)); - - LOG.info(logIndexList.toString()); - - List result = new ArrayList<>(); - for (String logIndex : logIndexList) { - List sessionIdList; - try { - sessionIdList = this.getSessions(props, es, logIndex); - Session session = new Session(props, es); - for (String aSessionIdList : sessionIdList) { - String[] sArr = aSessionIdList.split(","); - List datas = session.getRankingTrainData(sArr[1], sArr[2], sArr[0]); - result.addAll(datas); - } - } catch (Exception e) { - LOG.error("Error which extracting ranking train data: {}", e); - } - } - - return result; - } - - protected JavaRDD extractRankingTrainDataInParallel(Properties props, SparkDriver spark, ESDriver es) { - - List logIndexList = es.getIndexListWithPrefix(props.getProperty(MudrodConstants.LOG_INDEX)); - - LOG.info(logIndexList.toString()); - - List sessionIdList = new ArrayList<>(); - for (String logIndex : logIndexList) { - List tmpsessionList = this.getSessions(props, es, logIndex); - sessionIdList.addAll(tmpsessionList); - } - - JavaRDD sessionRDD = spark.sc.parallelize(sessionIdList, 16); - - JavaRDD clickStreamRDD = sessionRDD.mapPartitions( - new FlatMapFunction, RankingTrainData>() { - /** - * - */ - private static final long serialVersionUID = 1L; - - @Override - public Iterator call(Iterator arg0) throws Exception { - ESDriver tmpES = new ESDriver(props); - tmpES.createBulkProcessor(); - 
- Session session = new Session(props, tmpES); - List clickstreams = new ArrayList<>(); - while (arg0.hasNext()) { - String s = arg0.next(); - String[] sArr = s.split(","); - List clicks = session.getRankingTrainData(sArr[1], sArr[2], sArr[0]); - clickstreams.addAll(clicks); - } - tmpES.destroyBulkProcessor(); - tmpES.close(); - return clickstreams.iterator(); - } - }); - - LOG.info("Clickstream number: {}", clickStreamRDD.count()); - - return clickStreamRDD; - } - } diff --git a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/SessionTree.java b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/SessionTree.java index 5531f83..abe1ee9 100644 --- a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/SessionTree.java +++ b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/SessionTree.java @@ -90,6 +90,14 @@ public SessionTree(Properties props, ESDriver es, String sessionID, String clean this.sessionID = sessionID; this.cleanupType = cleanupType; } + + public SessionNode getRoot(){ + return this.root; + } + + public String getSessionId(){ + return this.sessionID; + } /** * insert: insert a node into the session tree. @@ -427,7 +435,7 @@ private List getViewNodes(SessionNode node) { return viewnodes; } - private List getQueryNodes(SessionNode node) { + public List getQueryNodes(SessionNode node) { return this.getNodes(node, MudrodConstants.SEARCH_MARKER); } @@ -447,76 +455,4 @@ private List getNodes(SessionNode node, String nodeKey) { return nodes; } - - /** - * Obtain the ranking training data. - * - * @param indexName the index from whcih to obtain the data - * @return {@link ClickStream} - * @throws UnsupportedEncodingException if there is an error whilst - * processing the ranking training data. 
- */ - public List getRankingTrainData(String indexName) throws UnsupportedEncodingException { - - List trainDatas = new ArrayList<>(); - - List queryNodes = this.getQueryNodes(this.root); - for (SessionNode querynode : queryNodes) { - List children = querynode.getChildren(); - - LinkedHashMap datasetOpt = new LinkedHashMap<>(); - int ndownload = 0; - for (SessionNode node : children) { - if ("dataset".equals(node.getKey())) { - Boolean bDownload = false; - List nodeChildren = node.getChildren(); - for (SessionNode aNodeChildren : nodeChildren) { - if ("ftp".equals(aNodeChildren.getKey())) { - bDownload = true; - ndownload += 1; - break; - } - } - datasetOpt.put(node.datasetId, bDownload); - } - } - - // method 1: The priority of download data are higher - if (datasetOpt.size() > 1 && ndownload > 0) { - // query - RequestUrl requestURL = new RequestUrl(); - String queryUrl = querynode.getRequest(); - String infoStr = requestURL.getSearchInfo(queryUrl); - String query = null; - try { - query = es.customAnalyzing(props.getProperty(MudrodConstants.ES_INDEX_NAME), infoStr); - } catch (InterruptedException | ExecutionException e) { - throw new RuntimeException("Error performing custom analyzing", e); - } - Map filter = RequestUrl.getFilterInfo(queryUrl); - - for (String datasetA : datasetOpt.keySet()) { - Boolean bDownloadA = datasetOpt.get(datasetA); - if (bDownloadA) { - for (String datasetB : datasetOpt.keySet()) { - Boolean bDownloadB = datasetOpt.get(datasetB); - if (!bDownloadB) { - - String[] queries = query.split(","); - for (String query1 : queries) { - RankingTrainData trainData = new RankingTrainData(query1, datasetA, datasetB); - trainData.setSessionId(this.sessionID); - trainData.setIndex(indexName); - trainData.setFilter(filter); - trainDatas.add(trainData); - } - } - } - } - } - } - } - - return trainDatas; - } } diff --git a/ranking/pom.xml b/ranking/pom.xml index 591fda2..428b03b 100644 --- a/ranking/pom.xml +++ b/ranking/pom.xml @@ -26,6 +26,8 @@ 
javaSVMWithSGDModel + 1.0.0-beta + nd4j-native-platform @@ -35,6 +37,21 @@ mudrod-core ${project.version} + + org.nd4j + nd4j-native-platform + ${nd4j.version} + + + org.nd4j + ${nd4j.backend} + ${nd4j.version} + + + org.deeplearning4j + deeplearning4j-core + ${nd4j.version} + diff --git a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/LearnerFactory.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/LearnerFactory.java index 4da1b49..96f2f29 100644 --- a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/LearnerFactory.java +++ b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/LearnerFactory.java @@ -17,7 +17,8 @@ import org.apache.sdap.mudrod.driver.ESDriver; import org.apache.sdap.mudrod.driver.SparkDriver; import org.apache.sdap.mudrod.main.MudrodConstants; -import org.apache.sdap.mudrod.ranking.ranksvm.SVMLearner; +import org.apache.sdap.mudrod.ranking.dlrank.DLRankLearner; +import org.apache.sdap.mudrod.ranking.ranksvm.RankSVMLearner; import org.apache.spark.SparkContext; import org.apache.spark.mllib.classification.SVMModel; import org.apache.spark.mllib.regression.LabeledPoint; @@ -34,10 +35,12 @@ public LearnerFactory(Properties props, ESDriver es, SparkDriver spark) { super(props, es, spark); } - public Learner createLearner() { - if ("1".equals(props.getProperty(MudrodConstants.RANKING_ML))) + public RankLearner createLearner() { + /*if ("1".equals(props.getProperty(MudrodConstants.RANKING_ML))) return new SVMLearner(props, es, spark, props.getProperty(MudrodConstants.RANKING_MODEL)); - return null; + return null;*/ + return new RankSVMLearner(props, es, spark, props.getProperty(MudrodConstants.RANKING_MODEL)); + //return new DLRankLearner(props, es, spark, ""); } } diff --git a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/Learner.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/RankLearner.java similarity index 84% rename from 
ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/Learner.java rename to ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/RankLearner.java index c2d78d8..61663cf 100644 --- a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/Learner.java +++ b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/RankLearner.java @@ -29,15 +29,17 @@ /** * learn rank weights from train data and predict search results ranking */ -public abstract class Learner extends MudrodAbstract { +public abstract class RankLearner extends MudrodAbstract { - public Learner(Properties props, ESDriver es, SparkDriver spark) { + public RankLearner(Properties props, ESDriver es, SparkDriver spark) { super(props, es, spark); } - public abstract String customizeTrainData(String sourceDir); + public abstract String customizeData(String sourceDir, String outFileName); public abstract void train(String trainFile); + + public abstract void evaluate(String testFile); public abstract double predict(double[] value); diff --git a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/Ranker.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/Ranker.java index cb3c9a8..5df0509 100644 --- a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/Ranker.java +++ b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/common/Ranker.java @@ -31,7 +31,7 @@ public class Ranker extends MudrodAbstract implements Serializable { private static final long serialVersionUID = 1L; transient List resultList = new ArrayList<>(); - Learner le = null; + RankLearner le = null; public Ranker(Properties props, ESDriver es, SparkDriver spark) { super(props, es, spark); diff --git a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/dlrank/DLLearner.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/dlrank/DLLearner.java deleted file mode 100644 index b5fd274..0000000 --- a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/dlrank/DLLearner.java +++ /dev/null @@ 
-1,66 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.sdap.mudrod.ranking.dlrank; - -import java.io.File; -import java.util.Properties; - -import org.apache.sdap.mudrod.driver.ESDriver; -import org.apache.sdap.mudrod.driver.SparkDriver; -import org.apache.sdap.mudrod.main.MudrodEngine; -import org.apache.sdap.mudrod.ranking.common.Learner; -import org.apache.sdap.mudrod.ranking.common.LearnerFactory; -import org.apache.spark.SparkContext; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.mllib.classification.SVMModel; -import org.apache.spark.mllib.classification.SVMWithSGD; -import org.apache.spark.mllib.linalg.Vectors; -import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.mllib.util.MLUtils; - -public class DLLearner extends Learner { - private static final long serialVersionUID = 1L; - transient SparkContext sc = null; - - public DLLearner(Properties props, ESDriver es, SparkDriver spark, String dlModel) { - super(props, es, spark); - - sc = spark.sc.sc(); - load(dlModel); - } - - @Override - public void train(String trainFile) { - - } - - @Override - public double predict(double[] value) { - return 0; - } - - @Override - public void save() { - } - - @Override - public void load(String svmSgdModel) { - } - - @Override - public String customizeTrainData(String sourceDir) { - return null; - } -} diff --git 
a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/dlrank/DLRankLearner.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/dlrank/DLRankLearner.java new file mode 100644 index 0000000..2325b6c --- /dev/null +++ b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/dlrank/DLRankLearner.java @@ -0,0 +1,212 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.sdap.mudrod.ranking.dlrank; + +import java.io.File; +import java.io.IOException; +import java.util.Properties; + +import org.apache.sdap.mudrod.driver.ESDriver; +import org.apache.sdap.mudrod.driver.SparkDriver; +import org.apache.sdap.mudrod.main.MudrodEngine; +import org.apache.sdap.mudrod.ranking.common.RankLearner; +import org.apache.sdap.mudrod.ranking.common.LearnerFactory; +import org.apache.sdap.mudrod.ranking.ranksvm.RankSVMLearner; +import org.apache.sdap.mudrod.ranking.ranksvm.SparkFormatter; +import org.apache.sdap.mudrod.ranking.traindata.RankTrainDataFactory; +import org.datavec.api.records.reader.RecordReader; +import org.datavec.api.records.reader.impl.csv.CSVRecordReader; +import org.datavec.api.split.FileSplit; +import org.datavec.api.util.ClassPathResource; +import org.deeplearning4j.datasets.datavec.RecordReaderDataSetIterator; +import org.deeplearning4j.eval.Evaluation; +import org.deeplearning4j.nn.conf.MultiLayerConfiguration; +import org.deeplearning4j.nn.conf.NeuralNetConfiguration; +import org.deeplearning4j.nn.conf.layers.DenseLayer; +import 
org.deeplearning4j.nn.conf.layers.OutputLayer; +import org.deeplearning4j.nn.multilayer.MultiLayerNetwork; +import org.deeplearning4j.nn.weights.WeightInit; +import org.deeplearning4j.optimize.listeners.ScoreIterationListener; +import org.nd4j.linalg.activations.Activation; +import org.nd4j.linalg.api.ndarray.INDArray; +import org.nd4j.linalg.factory.Nd4j; +import org.nd4j.linalg.learning.config.Nesterovs; +import org.nd4j.linalg.lossfunctions.LossFunctions.LossFunction; +import org.nd4j.linalg.dataset.DataSet; +import org.nd4j.linalg.dataset.api.iterator.DataSetIterator; + +public class DLRankLearner extends RankLearner { + + MultiLayerNetwork model = null; + + /** + * Constructor to train rank model with deep learning method + * + * @param classifierName + * classifier type + * @param skd + * an instance of spark driver + * @param svmSgdModel + * path to a trained model + */ + public DLRankLearner(Properties props, ESDriver es, SparkDriver spark, String dlModel) { + super(props, es, spark); + load(dlModel); + } + + @Override + public String customizeData(String sourceDir, String outFileName) { + RankTrainDataFactory factory = new RankTrainDataFactory(props, es, spark); + String resultFile = factory.createRankTrainData("experts", sourceDir); + + String path = new File(resultFile).getParent(); + String separator = System.getProperty("file.separator"); + String nd4jFile = path + separator + outFileName + ".csv"; + ND4JFormatter sf = new ND4JFormatter(); + sf.toND4Jformat(resultFile, nd4jFile); + return nd4jFile; + } + + @Override + public void train(String trainFile) { + //init model + if(model == null){ + + int seed = 123; + double learningRate = 0.01; + int numInputs = 7; + int numOutputs = 2; + int numHiddenNodes = 20; + MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder() + .seed(seed) + .updater(new Nesterovs(learningRate, 0.9)) + .list() + .layer(0, new DenseLayer.Builder().nIn(numInputs).nOut(numHiddenNodes) + .weightInit(WeightInit.XAVIER) + 
.activation(Activation.RELU) + .build()) + .layer(1, new OutputLayer.Builder(LossFunction.NEGATIVELOGLIKELIHOOD) + .weightInit(WeightInit.XAVIER) + .activation(Activation.SOFTMAX) + .nIn(numHiddenNodes).nOut(numOutputs).build()) + .pretrain(false).backprop(true).build(); + + model = new MultiLayerNetwork(conf); + model.init(); + } + + //Load the training data + int numLinesToSkip = 1; + char delimiter = ','; + RecordReader rr = new CSVRecordReader(numLinesToSkip,delimiter); + try { + rr.initialize(new FileSplit(new File(trainFile))); + } catch (IOException | InterruptedException e) { + e.printStackTrace(); + } + + //train model + int batchSize = 50; + int nEpochs = 30; + DataSetIterator trainIter = new RecordReaderDataSetIterator(rr,batchSize,0,2); + for ( int n = 0; n < nEpochs; n++) { + model.fit( trainIter ); + } + } + + @Override + public void evaluate(String testFile) { + //Load the test/evaluation data: + int batchSize = 50; + int numOutputs = 2; + int numLinesToSkip = 1; + char delimiter = ','; + RecordReader rrTest = new CSVRecordReader(numLinesToSkip,delimiter); + try { + rrTest.initialize(new FileSplit(new File(testFile))); + } catch (IOException | InterruptedException e) { + e.printStackTrace(); + } + DataSetIterator testIter = new RecordReaderDataSetIterator(rrTest,batchSize,0,2); + + System.out.println("Evaluate model...."); + Evaluation eval = new Evaluation(numOutputs); + while(testIter.hasNext()){ + DataSet t = testIter.next(); + INDArray features = t.getFeatureMatrix(); + INDArray lables = t.getLabels(); + INDArray predicted = model.output(features,false); + eval.eval(lables, predicted); + } + System.out.println(eval.stats()); + } + + @Override + public double predict(double[] value) { + int nRows = 1; + int nColumns = value.length; + INDArray features = Nd4j.zeros(nRows, nColumns); + for(int i=0; i 0; i--) { + list[i] = list[i - 1].replace("\"", ""); + } + list[0] = output; + + csvOutput.writeNext(list); // Write this array to the file + line = 
br.readLine(); + } + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } +} diff --git a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/ranksvm/SVMLearner.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/ranksvm/RankSVMLearner.java similarity index 62% rename from ranking/src/main/java/org/apache/sdap/mudrod/ranking/ranksvm/SVMLearner.java rename to ranking/src/main/java/org/apache/sdap/mudrod/ranking/ranksvm/RankSVMLearner.java index 2687482..a0f2c47 100644 --- a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/ranksvm/SVMLearner.java +++ b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/ranksvm/RankSVMLearner.java @@ -24,22 +24,28 @@ import org.apache.sdap.mudrod.driver.ESDriver; import org.apache.sdap.mudrod.driver.SparkDriver; import org.apache.sdap.mudrod.main.MudrodEngine; -import org.apache.sdap.mudrod.ranking.common.Learner; +import org.apache.sdap.mudrod.ranking.common.RankLearner; import org.apache.sdap.mudrod.ranking.common.LearnerFactory; import org.apache.sdap.mudrod.ranking.traindata.RankTrainDataFactory; import org.apache.spark.SparkContext; +import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.Function; +import org.apache.spark.api.java.function.PairFunction; import org.apache.spark.mllib.classification.SVMModel; import org.apache.spark.mllib.classification.SVMWithSGD; +import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics; import org.apache.spark.mllib.linalg.Vectors; import org.apache.spark.mllib.regression.LabeledPoint; import org.apache.spark.mllib.util.MLUtils; +import scala.Tuple2; + /** * Learn ranking weights with SVM model */ -public class SVMLearner extends Learner { +public class RankSVMLearner extends RankLearner { /** * */ @@ -57,21 +63,21 @@ public class SVMLearner extends Learner { * @param svmSgdModel * path to a trained 
model */ - public SVMLearner(Properties props, ESDriver es, SparkDriver spark, String svmSgdModel) { + public RankSVMLearner(Properties props, ESDriver es, SparkDriver spark, String svmSgdModel) { super(props, es, spark); - sc = spark.sc.sc(); load(svmSgdModel); } - public String customizeTrainData(String sourceDir) { + @Override + public String customizeData(String sourceDir, String outFileName) { RankTrainDataFactory factory = new RankTrainDataFactory(props, es, spark); - String resultFile = factory.createTrainData(sourceDir); + String resultFile = factory.createRankTrainData("experts", sourceDir); String path = new File(resultFile).getParent(); String separator = System.getProperty("file.separator"); - String svmSparkFile = path + separator + "inputDataForSVM_spark.txt"; + String svmSparkFile = path + separator + outFileName + ".txt"; SparkFormatter sf = new SparkFormatter(); sf.toSparkSVMformat(resultFile, svmSparkFile); @@ -85,17 +91,48 @@ public void train(String trainFile) { int numIterations = 100; model = SVMWithSGD.train(data.rdd(), numIterations); } + + @Override + public void evaluate(String testFile) { + JavaRDD data = MLUtils.loadLibSVMFile(sc, testFile).toJavaRDD(); + // Run training algorithm to build the model. 
+ JavaRDD> scoreAndLabels = data.map(p->{ + double score = model.predict(p.features()); + return new Tuple2<>(score, p.label()); + }); + BinaryClassificationMetrics metrics = new BinaryClassificationMetrics(scoreAndLabels.rdd()); + System.out.println("Area under ROC = " + metrics.areaUnderROC()); + long correctNum = scoreAndLabels.filter(new Function, Boolean>(){ + @Override + public Boolean call(Tuple2 arg0) throws Exception { + Integer predict = (Integer)arg0._1(); + Integer label = (Integer)arg0._2(); + int output = 0; + if (label == -1.0) { + output = 0; + } else if (label == 1.0) { + output = 1; + } + + if(predict == output){ + return true; + } + return false; + } + }).count(); + System.out.println("Accuracy = " + correctNum/scoreAndLabels.count()); + } @Override public double predict(double[] value) { LabeledPoint p = new LabeledPoint(99.0, Vectors.dense(value)); - return model.predict(p.features()); + return model.predict(p.features()); } @Override public void save() { // Save model - String modelPath = SVMLearner.class.getClassLoader().getResource("javaSVMWithSGDModel").toString(); + String modelPath = RankSVMLearner.class.getClassLoader().getResource("javaSVMWithSGDModel").toString(); model.save(sc, modelPath); } @@ -114,8 +151,7 @@ public static void main(String[] arg0) { ESDriver es = new ESDriver(props); LearnerFactory factory = new LearnerFactory(props, es, spark); - Learner le = factory.createLearner(); - + RankLearner le = factory.createLearner(); String sourceDir = arg0[0]; String trainFile = le.customizeTrainData(sourceDir); le.train(trainFile); diff --git a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/ranksvm/SparkSVM.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/ranksvm/SparkSVM.java deleted file mode 100644 index abedb76..0000000 --- a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/ranksvm/SparkSVM.java +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may 
not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.sdap.mudrod.ranking.ranksvm; - -import org.apache.sdap.mudrod.main.MudrodEngine; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.mllib.classification.SVMModel; -import org.apache.spark.mllib.classification.SVMWithSGD; -import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.mllib.util.MLUtils; - -public class SparkSVM { - - private SparkSVM() { - //public constructor - } - - public static void main(String[] args) { - MudrodEngine me = new MudrodEngine(); - - JavaSparkContext jsc = me.startSparkDriver().sc; - - String path = SparkSVM.class.getClassLoader().getResource("inputDataForSVM_spark.txt").toString(); - JavaRDD data = MLUtils.loadLibSVMFile(jsc.sc(), path).toJavaRDD(); - - // Run training algorithm to build the model. 
- int numIterations = 100; - final SVMModel model = SVMWithSGD.train(data.rdd(), numIterations); - - // Save and load model - model.save(jsc.sc(), SparkSVM.class.getClassLoader().getResource("javaSVMWithSGDModel").toString()); - - jsc.sc().stop(); - } -} diff --git a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/ExpertRankTrainData.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/ExpertRankTrainData.java index d0f8102..c0dcda9 100644 --- a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/ExpertRankTrainData.java +++ b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/ExpertRankTrainData.java @@ -151,7 +151,7 @@ public static void calculateVec(String[][] arr) { { for (int i = 1; i < arr.length - row; i++) { List colList = new ArrayList(); // create vector to store all values inside of a column, which is stored inside 2D vector - for (int col = 1; col < arr[0].length - 2; col++) // Columns go until the next to last column + for (int col = 1; col < arr[0].length - 1; col++) // Columns go until the next to last column { // Extract double value from each cell double x1 = Double.parseDouble(arr[row][col]); diff --git a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/RankingTrainData.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/LogRankTrainData.java similarity index 94% rename from core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/RankingTrainData.java rename to ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/LogRankTrainData.java index bdf477a..d8265ca 100644 --- a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/RankingTrainData.java +++ b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/LogRankTrainData.java @@ -1,4 +1,4 @@ -package org.apache.sdap.mudrod.weblog.structure.session; +package org.apache.sdap.mudrod.ranking.traindata; import java.io.Serializable; import java.util.Map; @@ -6,7 +6,7 @@ 
/** * ClassName: train data extracted from web logs for training ranking weightss. */ -public class RankingTrainData implements Serializable { +public class LogRankTrainData implements Serializable { /** * */ @@ -31,13 +31,13 @@ public class RankingTrainData implements Serializable { * @param highRankDataset the dataset name for the highest ranked dataset * @param lowRankDataset the dataset name for the lowest ranked dataset */ - public RankingTrainData(String query, String highRankDataset, String lowRankDataset) { + public LogRankTrainData(String query, String highRankDataset, String lowRankDataset) { this.query = query; this.highRankDataset = highRankDataset; this.lowRankDataset = lowRankDataset; } - public RankingTrainData() { + public LogRankTrainData() { //default constructor } diff --git a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/LogRankTrainDataExtractor.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/LogRankTrainDataExtractor.java new file mode 100644 index 0000000..44c82cf --- /dev/null +++ b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/LogRankTrainDataExtractor.java @@ -0,0 +1,159 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.sdap.mudrod.ranking.traindata; + +import org.apache.sdap.mudrod.discoveryengine.MudrodAbstract; +import org.apache.sdap.mudrod.driver.ESDriver; +import org.apache.sdap.mudrod.driver.SparkDriver; +import org.apache.sdap.mudrod.main.MudrodConstants; +import org.apache.sdap.mudrod.weblog.structure.log.RequestUrl; +import org.apache.sdap.mudrod.weblog.structure.session.ClickStream; +import org.apache.sdap.mudrod.weblog.structure.session.Session; +import org.apache.sdap.mudrod.weblog.structure.session.SessionExtractor; +import org.apache.sdap.mudrod.weblog.structure.session.SessionNode; +import org.apache.sdap.mudrod.weblog.structure.session.SessionTree; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.function.FlatMapFunction; +import java.io.UnsupportedEncodingException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.concurrent.ExecutionException; + +/** + * ClassName: SessionExtractor Function: Extract sessions details from + * reconstructed sessions. + */ +public class LogRankTrainDataExtractor extends MudrodAbstract { + + public LogRankTrainDataExtractor(Properties props, ESDriver es, SparkDriver spark) { + super(props, es, spark); + // TODO Auto-generated constructor stub + } + + /** + * extractTrainDataFromIndex: extract rank train data from logs stored in Elasticsearch. 
+ * + * @param logIndices + * the index name + * @return LogRankTrainData list {@link LogRankTrainData} + */ + public JavaRDD extractTrainDataFromIndices(String logIndices) throws UnsupportedEncodingException { + List result = new ArrayList<>(); + SessionExtractor sessionExtrator = new SessionExtractor(); + List sessions = sessionExtrator.getSessions(props, es, logIndices); + + JavaRDD sessionRDD = spark.sc.parallelize(sessions, 16); + JavaRDD trainDataRDD = sessionRDD.mapPartitions(new FlatMapFunction, LogRankTrainData>() { + private static final long serialVersionUID = 1L; + @Override + public Iterator call(Iterator arg0) throws Exception { + ESDriver tmpES = new ESDriver(props); + tmpES.createBulkProcessor(); + List trainData = new ArrayList<>(); + while (arg0.hasNext()) { + String s = arg0.next(); + String[] sArr = s.split(","); + List data = extractTrainDataFromSession(sArr[1], sArr[2], sArr[0]); + trainData.addAll(data); + } + tmpES.destroyBulkProcessor(); + tmpES.close(); + return trainData.iterator(); + } + }); + + return trainDataRDD; + } + + /** + * Obtain the ranking training data. + * + * @param indexName + * the index from whcih to obtain the data + * @return {@link ClickStream} + * @throws UnsupportedEncodingException + * if there is an error whilst processing the ranking training data. 
+ */ + public List extractTrainDataFromSession(String indexName, String cleanuptype, String sessionID) throws UnsupportedEncodingException { + + List trainDatas = new ArrayList<>(); + + Session session = new Session(props, es); + SessionTree tree = null; + tree = session.getSessionTree(indexName, cleanuptype, sessionID); + + List queryNodes = tree.getQueryNodes(tree.getRoot()); + for (SessionNode querynode : queryNodes) { + List children = querynode.getChildren(); + + LinkedHashMap datasetOpt = new LinkedHashMap<>(); + int ndownload = 0; + for (SessionNode node : children) { + if ("dataset".equals(node.getKey())) { + Boolean bDownload = false; + List nodeChildren = node.getChildren(); + for (SessionNode aNodeChildren : nodeChildren) { + if ("ftp".equals(aNodeChildren.getKey())) { + bDownload = true; + ndownload += 1; + break; + } + } + datasetOpt.put(node.getDatasetId(), bDownload); + } + } + + // method 1: The priority of download data are higher + if (datasetOpt.size() > 1 && ndownload > 0) { + // query + RequestUrl requestURL = new RequestUrl(); + String queryUrl = querynode.getRequest(); + String infoStr = requestURL.getSearchInfo(queryUrl); + String query = null; + try { + query = es.customAnalyzing(props.getProperty(MudrodConstants.ES_INDEX_NAME), infoStr); + } catch (InterruptedException | ExecutionException e) { + throw new RuntimeException("Error performing custom analyzing", e); + } + Map filter = RequestUrl.getFilterInfo(queryUrl); + + for (String datasetA : datasetOpt.keySet()) { + Boolean bDownloadA = datasetOpt.get(datasetA); + if (bDownloadA) { + for (String datasetB : datasetOpt.keySet()) { + Boolean bDownloadB = datasetOpt.get(datasetB); + if (!bDownloadB) { + + String[] queries = query.split(","); + for (String query1 : queries) { + LogRankTrainData trainData = new LogRankTrainData(query1, datasetA, datasetB); + trainData.setSessionId(tree.getSessionId()); + trainData.setIndex(indexName); + trainData.setFilter(filter); + trainDatas.add(trainData); 
+ } + } + } + } + } + } + } + + return trainDatas; + } +} diff --git a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/RankTrainDataFactory.java b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/RankTrainDataFactory.java index 6594d20..baea867 100644 --- a/ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/RankTrainDataFactory.java +++ b/ranking/src/main/java/org/apache/sdap/mudrod/ranking/traindata/RankTrainDataFactory.java @@ -14,10 +14,12 @@ package org.apache.sdap.mudrod.ranking.traindata; import org.apache.sdap.mudrod.discoveryengine.MudrodAbstract; +import org.apache.sdap.mudrod.discoveryengine.WeblogDiscoveryEngine; import org.apache.sdap.mudrod.driver.ESDriver; import org.apache.sdap.mudrod.driver.SparkDriver; import org.apache.sdap.mudrod.main.MudrodConstants; -import org.apache.sdap.mudrod.ranking.ranksvm.SVMLearner; +import org.apache.sdap.mudrod.main.MudrodEngine; +import org.apache.sdap.mudrod.ranking.ranksvm.RankSVMLearner; import org.apache.spark.SparkContext; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.mllib.classification.SVMModel; @@ -25,24 +27,39 @@ import java.io.File; import java.io.Serializable; +import java.io.UnsupportedEncodingException; +import java.util.List; import java.util.Properties; /** - * Create train data from difference sources, including experts provided data, offline logs and realtime logs + * Create train data from difference sources, including experts provided data, + * offline logs and realtime logs */ -public class RankTrainDataFactory extends MudrodAbstract{ +public class RankTrainDataFactory extends MudrodAbstract { public RankTrainDataFactory(Properties props, ESDriver es, SparkDriver spark) { super(props, es, spark); } - public String createTrainData(String sourceDir) { - - return createTrainDataFromExperts(sourceDir); + public String createRankTrainData(String source, String sourceDir) { + String result = ""; + switch (source) { + + case "experts": + 
result = createRankTrainDataFromExperts(sourceDir); + break; + case "index": + result = createRankTrainDataFromIndices(sourceDir); + break; + case "log": + result = createRankTrainDataFromLog(sourceDir); + break; + } + return result; } - public String createTrainDataFromExperts(String sourceDir) { + public String createRankTrainDataFromExperts(String sourceDir) { File sourceFile = new File(sourceDir); boolean bDir = sourceFile.isDirectory(); boolean multFiles = false; @@ -50,14 +67,43 @@ public String createTrainDataFromExperts(String sourceDir) { multFiles = true; } - String resultDir = sourceFile.getParent() + "/trainsets.txt"; + String separator = System.getProperty("file.separator"); + String resultDir = sourceFile.getParent() + separator + "experts_trainsets.csv"; ExpertRankTrainData converter = new ExpertRankTrainData(sourceDir, resultDir, true); converter.convertToTrainSet(); return resultDir; } - public String createTrainDataFromOfflineLogs(String trainsetFile, int start, int mode) { - return ""; + public String createRankTrainDataFromIndices(String indexNames) { + + if (indexNames == "") { + List logIndexList = es.getIndexListWithPrefix(props.getProperty(MudrodConstants.LOG_INDEX)); + indexNames = String.join(",", logIndexList); + } + + LogRankTrainDataExtractor extractor = new LogRankTrainDataExtractor(props, es, spark); + JavaRDD trainDataRDD = null; + try { + trainDataRDD = extractor.extractTrainDataFromIndices(indexNames); + } catch (UnsupportedEncodingException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + String trainSetFile = "E:\\Mudrod_input_data\\Testing_Data_4_1monthLog+Meta+Onto\\traing.txt"; + JavaRDD jsonRDD = trainDataRDD.map(f -> f.toJson()); + jsonRDD.coalesce(1, true).saveAsTextFile(trainSetFile); + + return trainSetFile; + } + + public String createRankTrainDataFromLog(String logDir) { + + WeblogDiscoveryEngine webEngine = new WeblogDiscoveryEngine(props, es, spark); + String indices = 
webEngine.logIngestAndParse(logDir); + + String trainSetFile = createRankTrainDataFromIndices(indices); + return trainSetFile; } }