diff --git a/core/pom.xml b/core/pom.xml index 36e2a49f3..5081e9623 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -109,6 +109,12 @@ jena-arq + + com.github.tototoshi + scala-csv_2.11 + 1.3.10 + + com.google.guava guava diff --git a/core/src/main/scala/org/dbpedia/extraction/util/WikiInfo.scala b/core/src/main/scala/org/dbpedia/extraction/util/WikiInfo.scala index 6ef380c4a..0146ff4bd 100644 --- a/core/src/main/scala/org/dbpedia/extraction/util/WikiInfo.scala +++ b/core/src/main/scala/org/dbpedia/extraction/util/WikiInfo.scala @@ -1,8 +1,12 @@ package org.dbpedia.extraction.util +import com.github.tototoshi.csv.CSVReader + import java.io.File import java.net.URL import java.util.logging.Logger +import java.io.StringReader +import java.io.StringReader import org.dbpedia.extraction.config.ConfigUtils @@ -47,17 +51,31 @@ object WikiInfo * */ def fromLines(lines: Iterator[String]): Seq[WikiInfo] = { - val info = new ArrayBuffer[WikiInfo] - - if (! lines.hasNext) throw new Exception("empty file") - lines.next // skip first line (headers) - - for (line <- lines) - if (line.nonEmpty) - fromLine(line) match{ - case Some(x) => info += x - case None => - } + val info = new ArrayBuffer[WikiInfo] + + // Join all lines back into a single string for proper CSV parsing + val content = lines.mkString("\n") + val reader = CSVReader.open(new StringReader(content)) + + try { + val allRows = reader.iterator.toSeq + + if (allRows.isEmpty) throw new Exception("empty file") + + // Skip header row + for (row <- allRows.tail) { + if (row.nonEmpty && row.length >= 15) { + val pages = try row(4).toInt catch { case _: NumberFormatException => 0 } + val wikiCode = row(2) + + if (ConfigUtils.LanguageRegex.pattern.matcher(wikiCode).matches) { + info += new WikiInfo(wikiCode, pages) + } + } + } + } finally { + reader.close() + } info } @@ -66,22 +84,28 @@ object WikiInfo * Reads a WikiInfo object from a single CSV line. */ def fromLine(line: String): Option[WikiInfo] = { - val fields = line.split(",", -1) - - if (fields.length < 15) throw new Exception("expected [15] fields, found ["+fields.length+"] in line ["+line+"]") - - val pages = try fields(4).toInt - catch { case nfe: NumberFormatException => 0 } - - val wikiCode = fields(2) - if (! ConfigUtils.LanguageRegex.pattern.matcher(fields(2)).matches) throw new Exception("expected language code in field with index [2], found line ["+line+"]") - - //if(Language.map.keySet.contains(wikiCode)) - Option(new WikiInfo(wikiCode, pages)) - //else - //{ - // logger.log(Level.WARNING, "Language: " + wikiCode + " will be ignored. Add this language to the addonlangs.json file to extract it.") - // None - //} + val reader = CSVReader.open(new StringReader(line)) + try { + val fields = reader.iterator.toSeq.headOption.getOrElse(Seq.empty) + + if (fields.length < 15) { + logger.warning(s"expected [15] fields, found [${fields.length}] in line [${line.take(100)}...]") + return None + } + + val pages = try fields(4).toInt + catch { case nfe: NumberFormatException => 0 } + + val wikiCode = fields(2) + if (!ConfigUtils.LanguageRegex.pattern.matcher(wikiCode).matches) { + logger.warning(s"expected language code in field with index [2], found line [${line.take(100)}...]") + return None + } + + Option(new WikiInfo(wikiCode, pages)) + } finally { + reader.close() + } + } -} +} \ No newline at end of file