diff --git a/DESCRIPTION b/DESCRIPTION
index d4aaf01..9d79cb5 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -29,3 +29,4 @@ Author: Ingo Feinerer [aut, cre] (),
 Maintainer: Ingo Feinerer
 Repository: CRAN
 Date/Publication: 2018-12-21 13:55:26 UTC
+RoxygenNote: 6.1.1
diff --git a/NAMESPACE b/NAMESPACE
index 5f2c0fd..ccc203e 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -92,6 +92,8 @@ export("as.DocumentTermMatrix",
        "weightTfIdf",
        "weightBin",
        "weightSMART",
+       "weightModel",
+       "weightSMART2",
        "writeCorpus",
        "XMLSource",
        "XMLTextDocument",
diff --git a/R/weight.R b/R/weight.R
index 8890646..f286006 100644
--- a/R/weight.R
+++ b/R/weight.R
@@ -41,6 +41,155 @@ weightTfIdf <-
         if (isDTM) t(m) else m
     }, "term frequency - inverse document frequency", "tf-idf")
 
+## Collect the corpus statistics that weightSMART2() can reuse through its
+## 'control' argument: the per-term document frequency and the number of
+## documents of 'm'.
+##
+## m        a DocumentTermMatrix or TermDocumentMatrix.
+## spec     accepted but not used by this function.
+## control  accepted but not used by this function.
+##
+## Returns a list with components 'docfreq' (integer vector, one entry per
+## term) and 'ndoc' (number of documents).
+weightModel <- function(m, spec = "nnn", control = list())
+{
+    stopifnot(inherits(m, c("DocumentTermMatrix", "TermDocumentMatrix")))
+
+    ## Count documents per term: terms must be the rows, so bring a
+    ## document-term matrix into term-document orientation first (the same
+    ## orientation weightSMART2() computes its document frequencies in).
+    if (inherits(m, "DocumentTermMatrix")) m <- t(m)
+
+    docfreq <- row_sums(m > 0)
+    mode(docfreq) <- "integer"
+
+    list("docfreq" = docfreq,
+         "ndoc" = nDocs(m))
+}
+
+## SMART weighting that optionally takes the document frequencies and the
+## number of documents from 'control' instead of computing them from 'm'.
+##
+## m        a DocumentTermMatrix or TermDocumentMatrix.
+## spec     three characters selecting the term frequency ("n", "l", "a",
+##          "b", "L"), document frequency ("n", "t", "p") and normalization
+##          ("n", "c", "u", "b") components.
+## control  a list; 'docfreq' and 'ndoc' override the values derived from
+##          'm', 'pivot' and 'slope' are required for normalization "u",
+##          'alpha' is required for normalization "b".
+##
+## Returns the weighted matrix in the orientation of 'm'.
+weightSMART2 <-
+    WeightFunction(function(m, spec = "nnn", control = list()) {
+        stopifnot(inherits(m, c("DocumentTermMatrix", "TermDocumentMatrix")),
+                  is.character(spec), nchar(spec) == 3L, is.list(control))
+
+        term_frequency <-
+            match.arg(substr(spec, 1L, 1L),
+                      c("n", "l", "a", "b", "L"))
+        document_frequency <-
+            match.arg(substr(spec, 2L, 2L),
+                      c("n", "t", "p"))
+        normalization <-
+            match.arg(substr(spec, 3L, 3L),
+                      c("n", "c", "u", "b"))
+
+        ## All computations below use the term-document orientation.
+        isDTM <- inherits(m, "DocumentTermMatrix")
+        if (isDTM) m <- t(m)
+
+        if (normalization == "b") {
+            ## Need to compute the character lengths of the documents
+            ## before starting the weighting.
+            charlengths <-
+                tapply(nchar(Terms(m))[m$i] * m$v, m$j, sum)
+        }
+
+        ## Term frequency
+        m$v <-
+            switch(term_frequency,
+                   ## natural
+                   n = m$v,
+                   ## logarithm
+                   l = 1 + log2(m$v),
+                   ## augmented
+                   a = {
+                       s <- tapply(m$v, m$j, max)
+                       0.5 + (0.5 * m$v) / s[as.character(m$j)]
+                   },
+                   ## boolean
+                   b = as.numeric(m$v > 0),
+                   ## log ave
+                   L = {
+                       s <- tapply(m$v, m$j, mean)
+                       ((1 + log2(m$v)) / (1 + log2(s[as.character(m$j)])))
+                   })
+
+        ## Document frequency: values supplied in 'control' take precedence
+        ## over the ones computed from 'm'.
+        if (!is.null(control$docfreq))
+            rs <- control$docfreq
+        else
+            rs <- row_sums(m > 0)
+        if (!is.null(control$ndoc))
+            ndoc <- control$ndoc
+        else
+            ndoc <- nDocs(m)
+
+        if (any(rs == 0))
+            warning("unreferenced term(s): ",
+                    paste(Terms(m)[rs == 0], collapse = " "))
+        df <-
+            switch(document_frequency,
+                   ## natural
+                   n = 1,
+                   ## idf
+                   t = log2(ndoc / rs),
+                   ## prob idf; pmax() clips each term at zero on its
+                   ## own, whereas max() would collapse all terms into a
+                   ## single scalar
+                   p = pmax(log2((ndoc - rs) / rs), 0))
+        df[!is.finite(df)] <- 0
+
+        ## Normalization
+        cs <- col_sums(m)
+        if (any(cs == 0))
+            warning("empty document(s): ",
+                    paste(Docs(m)[cs == 0], collapse = " "))
+        norm <-
+            switch(normalization,
+                   ## none
+                   n = rep.int(1, nDocs(m)),
+                   ## cosine
+                   c = sqrt(col_sums(m ^ 2)),
+                   ## pivoted unique
+                   u = {
+                       if (is.null(pivot <- control$pivot))
+                           stop("invalid control argument pivot")
+                       if (is.null(slope <- control$slope))
+                           stop("invalid control argument slope")
+                       (slope * sqrt(col_sums(m ^ 2)) +
+                        (1 - slope) * pivot)
+                   },
+                   ## byte size
+                   b = {
+                       if (is.null(alpha <- control$alpha))
+                           stop("invalid control argument alpha")
+                       norm <- double(nDocs(m))
+                       norm[match(names(charlengths),
+                                  seq_along(norm))] <-
+                           charlengths ^ alpha
+                       norm
+                   })
+
+        m <- m * df
+        m$v <- m$v / norm[m$j]
+        attr(m, "weighting") <- c(paste("SMART", spec), "SMART")
+
+        ## Return visibly, as weightTfIdf does.
+        if (isDTM) t(m) else m
+    }, "SMART2", "SMART2")
+
 weightSMART <-
     WeightFunction(function(m, spec = "nnn", control = list()) {
         stopifnot(inherits(m, c("DocumentTermMatrix", "TermDocumentMatrix")),
diff --git a/src/RcppExports.o b/src/RcppExports.o
new file mode 100644
index 0000000..4907210
Binary files /dev/null and b/src/RcppExports.o differ
diff --git a/src/copy.o b/src/copy.o
new file mode 100644
index 0000000..294a12c
Binary files /dev/null and b/src/copy.o differ
diff --git a/src/init.o b/src/init.o
new file mode 100644
index 0000000..ee94330
Binary files /dev/null and b/src/init.o differ
diff --git a/src/remove.o b/src/remove.o
new file mode 100644
index 0000000..8b7d27c
Binary files /dev/null and b/src/remove.o differ
diff --git a/src/scan.o b/src/scan.o
new file mode 100644
index 0000000..1511c29
Binary files /dev/null and b/src/scan.o differ
diff --git a/src/tdm.o b/src/tdm.o
new file mode 100644
index 0000000..6177c4b
Binary files /dev/null and b/src/tdm.o differ
diff --git a/src/tm.so b/src/tm.so
new file mode 100755
index 0000000..9ac6351
Binary files /dev/null and b/src/tm.so differ
diff --git a/src/tokenizer.o b/src/tokenizer.o
new file mode 100644
index 0000000..a91a0f1
Binary files /dev/null and b/src/tokenizer.o differ