diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml
new file mode 100644
index 0000000..8e6efee
--- /dev/null
+++ b/.github/workflows/docker-image.yml
@@ -0,0 +1,52 @@
+
+#..........................................................................................................
+# Build, push and cache the docker image. The following must be adjusted in case of a different repository:
+# - add the 'BUILD_DATE' arg in the Dockerfile
+# - create a 'DOCKER_PASSWORD' secret (the Docker Hub access token) in the 'Settings' tab of the repository
+# References:
+# - https://github.com/mlampros/IceSat2R/blob/master/.github/workflows/docker_image.yml
+# - https://github.com/orgs/community/discussions/25768#discussioncomment-3249184
+#..........................................................................................................
+
+on:
+ push:
+ branches: [main, master]
+ pull_request:
+ branches: [main, master]
+
+name: docker_img
+
+jobs:
+ build:
+ runs-on: ubuntu-latest
+
+ steps:
+ - id: string
+ uses: ASzc/change-string-case-action@v1
+ with:
+ string: ${{ github.event.repository.name }}
+
+ - name: Check Out Repo
+ uses: actions/checkout@v2
+
+ - name: Login to Docker Hub
+ uses: docker/login-action@v2
+ with:
+ username: ${{ github.repository_owner }}
+ password: ${{ secrets.DOCKER_PASSWORD }}
+
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@v1
+
+ - name: Build and push
+ uses: docker/build-push-action@v2
+ with:
+ context: ./
+ build-args: BUILD_DATE="$(date -u +'%Y-%m-%dT%H:%M:%SZ')"
+ file: ./Dockerfile
+ builder: ${{ steps.buildx.outputs.name }}
+ push: true
+ tags: ${{ github.repository_owner }}/${{ steps.string.outputs.lowercase }}:rstudiodev
+ cache-from: type=registry,ref=${{ github.repository_owner }}/${{ steps.string.outputs.lowercase }}:buildcache
+ cache-to: type=registry,ref=${{ github.repository_owner }}/${{ steps.string.outputs.lowercase }}:buildcache,mode=max
diff --git a/.github/workflows/issue.yml b/.github/workflows/issue.yml
index 179031f..ebb037b 100644
--- a/.github/workflows/issue.yml
+++ b/.github/workflows/issue.yml
@@ -13,6 +13,6 @@ jobs:
with:
repo-token: ${{ secrets.GITHUB_TOKEN }}
ignore-comments: true
- labels-synonyms: '{"bug":["error","need fix","not working"],"enhancement":["upgrade"],"question":["help"]}'
- labels-not-allowed: '["good first issue"]'
- default-labels: '["help wanted"]'
+ labels-synonyms: '{"bug":["error","need fix","not working"],"enhancement":["upgrade"],"question":["help","how can i"]}'
+ labels-not-allowed: '["documentation","duplicate","good first issue","help wanted","invalid"]'
+ default-labels: '["triage"]'
diff --git a/DESCRIPTION b/DESCRIPTION
index 4401deb..d8a1abc 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,8 +1,8 @@
Package: textTinyR
Type: Package
Title: Text Processing for Small or Big Data Files
-Version: 1.1.7
-Date: 2021-10-29
+Version: 1.1.8
+Date: 2023-12-04
Authors@R: c( person(given = "Lampros", family = "Mouselimis", email = "mouselimislampros@gmail.com", role = c("aut", "cre"), comment = c(ORCID = "https://orcid.org/0000-0002-8024-1546")))
BugReports: https://github.com/mlampros/textTinyR/issues
URL: https://github.com/mlampros/textTinyR
@@ -16,6 +16,6 @@ Imports: Rcpp (>= 0.12.10), R6, data.table, utils
LinkingTo: Rcpp, RcppArmadillo (>= 0.7.8), BH
Suggests: testthat, covr, knitr, rmarkdown
VignetteBuilder: knitr
-RoxygenNote: 7.1.2
+RoxygenNote: 7.2.3
NeedsCompilation: yes
Packaged: 2017-04-01 13:56:22 UTC; lampros
diff --git a/Dockerfile b/Dockerfile
index 961db92..6833e36 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,19 +1,19 @@
FROM rocker/rstudio:devel
-
LABEL maintainer='Lampros Mouselimis'
-
RUN export DEBIAN_FRONTEND=noninteractive; apt-get -y update && \
apt-get install -y zlib1g-dev git-core pandoc pandoc-citeproc libcurl4-openssl-dev libssl-dev && \
apt-get install -y sudo && \
apt-get install -y libarmadillo-dev && \
- R -e "install.packages(c( 'Rcpp', 'R6', 'data.table', 'utils', 'RcppArmadillo', 'BH', 'testthat', 'covr', 'knitr', 'rmarkdown', 'remotes' ), repos = 'https://cloud.r-project.org/' )" && \
- R -e "remotes::install_github('mlampros/textTinyR', upgrade = 'always', dependencies = TRUE, repos = 'https://cloud.r-project.org/')" && \
+ R -e "install.packages(c( 'Rcpp', 'R6', 'data.table', 'utils', 'RcppArmadillo', 'BH', 'testthat', 'covr', 'knitr', 'rmarkdown', 'remotes' ), repos = 'https://cloud.r-project.org/' )"
+
+ADD http://www.random.org/strings/?num=10&len=8&digits=on&upperalpha=on&loweralpha=on&unique=on&format=plain&rnd=new uuid
+ARG BUILD_DATE
+
+RUN echo "$BUILD_DATE"
+RUN R -e "remotes::install_github('mlampros/textTinyR', upgrade = 'always', dependencies = TRUE, repos = 'https://cloud.r-project.org/')" && \
apt-get autoremove -y && \
apt-get clean
-
ENV USER rstudio
-
-
diff --git a/NAMESPACE b/NAMESPACE
index c2ff613..153d24a 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -5,6 +5,7 @@ export(Count_Rows)
export(Doc2Vec)
export(JACCARD_DICE)
export(TEXT_DOC_DISSIM)
+export(batch_compute)
export(big_tokenize_transform)
export(bytes_converter)
export(cluster_frequency)
diff --git a/NEWS.md b/NEWS.md
index 19f550f..a60457d 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,4 +1,14 @@
+## textTinyR 1.1.8
+
+* I've fixed the CRAN *warning: format specifies type 'int' but the argument has type 'long long'* in the following files & lines by replacing the `%3d` format specifier with `%3lld`:
+    * ./token_big_files.h:862:60
+    * ./term_matrix.h:456:75 *and* 647:75
+    * ./word_vecs_pointer_embedding.cpp:333:67 *and* 240:68
+* I removed the `CXX_STD = CXX11` specification from the "Makevars" files and the `[[Rcpp::plugins(cpp11)]]` plugin from the ".cpp" files, due to the CRAN NOTE *"Specified C++11: please drop specification unless essential"* (see also: https://www.tidyverse.org/blog/2023/03/cran-checks-compiled-code/#note-regarding-systemrequirements-c11)
+* I exported the *batch_calculation()* Rcpp function and created the *batch_compute()* R function
+* I removed the `-mthreads` compilation option from the "Makevars.win" file
+
## textTinyR 1.1.7
diff --git a/R/utils.R b/R/utils.R
index 9ab3b98..c719153 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -645,36 +645,31 @@ utf_locale = function(language = "english") {
#'
#' @usage # utl <- big_tokenize_transform$new(verbose = FALSE)
#' @examples
+#'
+#' \dontrun{
#'
#' library(textTinyR)
#'
-#'
-#' # fs <- big_tokenize_transform$new(verbose = FALSE)
+#' fs <- big_tokenize_transform$new(verbose = FALSE)
#'
#' #---------------
#' # file splitter:
#' #---------------
#'
-#' # fs$big_text_splitter(input_path_file = "input.txt",
-#'
-#' # output_path_folder = "/folder/output/",
-#'
-#' # end_query = "endword", batches = 5,
-#'
-#' # trimmed_line = FALSE)
+#' fs$big_text_splitter(input_path_file = "input.txt",
+#' output_path_folder = "/folder/output/",
+#' end_query = "endword", batches = 5,
+#' trimmed_line = FALSE)
#'
#'
#' #-------------
#' # file parser:
#' #-------------
#'
-#' # fs$big_text_parser(input_path_folder = "/folder/output/",
-#'
-#' # output_path_folder = "/folder/parser/",
-#'
-#' # start_query = "startword", end_query = "endword",
-#'
-#' # min_lines = 1, trimmed_line = TRUE)
+#' fs$big_text_parser(input_path_folder = "/folder/output/",
+#' output_path_folder = "/folder/parser/",
+#' start_query = "startword", end_query = "endword",
+#' min_lines = 1, trimmed_line = TRUE)
#'
#'
#' #----------------
@@ -682,32 +677,24 @@ utf_locale = function(language = "english") {
#' #----------------
#'
#'
-#' # fs$big_text_tokenizer(input_path_folder = "/folder/parser/",
-#'
-#' # batches = 5, split_string=TRUE,
-#'
-#' # to_lower = TRUE, trim_token = TRUE,
-#'
-#' # max_num_char = 100, remove_stopwords = TRUE,
-#'
-#' # stemmer = "porter2_stemmer", threads = 1,
-#'
-#' # path_2folder="/folder/output_token/",
-#'
-#' # vocabulary_path_folder="/folder/VOCAB/")
+#' fs$big_text_tokenizer(input_path_folder = "/folder/parser/",
+#' batches = 5, split_string=TRUE,
+#' to_lower = TRUE, trim_token = TRUE,
+#' max_num_char = 100, remove_stopwords = TRUE,
+#' stemmer = "porter2_stemmer", threads = 1,
+#' path_2folder="/folder/output_token/",
+#' vocabulary_path_folder="/folder/VOCAB/")
#'
#' #-------------------
#' # vocabulary counts:
#' #-------------------
#'
#'
-#' # fs$vocabulary_accumulator(input_path_folder = "/folder/VOCAB/",
-#'
-#' # vocabulary_path_file = "/folder/vocab.txt",
-#'
-#' # max_num_chars = 50)
-
-
+#' fs$vocabulary_accumulator(input_path_folder = "/folder/VOCAB/",
+#' vocabulary_path_file = "/folder/vocab.txt",
+#' max_num_chars = 50)
+#'
+#' }
big_tokenize_transform <- R6::R6Class("big_tokenize_transform",
@@ -1006,16 +993,16 @@ big_tokenize_transform <- R6::R6Class("big_tokenize_transform",
#' Stemming of the english language is done using the porter2-stemmer, for details see \url{https://github.com/smassung/porter2_stemmer}
#'
#' @examples
+#'
+#' \dontrun{
#'
#' library(textTinyR)
#'
-#' # vps = vocabulary_parser(input_path_file = '/folder/input_data.txt',
-#'
-#' # start_query = 'start_word', end_query = 'end_word',
-#'
-#' # vocabulary_path_file = '/folder/vocab.txt',
-#'
-#' # to_lower = TRUE, split_string = TRUE)
+#' vps = vocabulary_parser(input_path_file = '/folder/input_data.txt',
+#' start_query = 'start_word', end_query = 'end_word',
+#' vocabulary_path_file = '/folder/vocab.txt',
+#' to_lower = TRUE, split_string = TRUE)
+#' }
vocabulary_parser = function(input_path_file = NULL, start_query = NULL, end_query = NULL, vocabulary_path_file = NULL, min_lines = 1, trimmed_line = FALSE, to_lower = FALSE,
@@ -1146,10 +1133,14 @@ vocabulary_parser = function(input_path_file = NULL, start_query = NULL, end_que
#' @return a number
#' @export
#' @examples
+#'
+#' \dontrun{
#'
#' library(textTinyR)
#'
-#' # bc = bytes_converter(input_path_file = 'some_file.txt', unit = "MB")
+#' bc = bytes_converter(input_path_file = 'some_file.txt', unit = "MB")
+#'
+#' }
bytes_converter = function(input_path_file = NULL, unit = "MB") {
@@ -1178,41 +1169,36 @@ bytes_converter = function(input_path_file = NULL, unit = "MB") {
#' The text file should have a structure (such as an xml-structure), so that subsets can be extracted using the \emph{start_query} and \emph{end_query} parameters ( the same applies in case of a vector of character strings)
#' @export
#' @examples
+#'
+#' \dontrun{
#'
#' library(textTinyR)
#'
#' # In case that the 'input_path_file' is a valid path
#' #---------------------------------------------------
-#'
-#' # fp = text_file_parser(input_path_file = '/folder/input_data.txt',
-#'
-#' # output_path_file = '/folder/output_data.txt',
-#'
-#' # start_query = 'word_a', end_query = 'word_w',
-#'
-#' # min_lines = 1, trimmed_line = FALSE)
-#'
-#'
+#'
+#' fp = text_file_parser(input_path_file = '/folder/input_data.txt',
+#' output_path_file = '/folder/output_data.txt',
+#' start_query = 'word_a', end_query = 'word_w',
+#' min_lines = 1, trimmed_line = FALSE)
+#'
+#'
#' # In case that the 'input_path_file' is a character vector of strings
#' #--------------------------------------------------------------------
-#'
-#' # PATH_url = "https://FILE.xml"
-#'
-#' # con = url(PATH_url, method = "libcurl")
-#'
-#' # tmp_dat = read.delim(con, quote = "\"", comment.char = "", stringsAsFactors = FALSE)
-#'
-#' # vec_docs = unlist(lapply(1:length(as.vector(tmp_dat[, 1])), function(x)
-#'
-#' # trimws(tmp_dat[x, 1], which = "both")))
-#'
-#' # parse_data = text_file_parser(input_path_file = vec_docs,
-#'
-#' # start_query = c("", "", ""),
-#'
-#' # end_query = c("", "", ""),
-#'
-#' # min_lines = 1, trimmed_line = TRUE)
+#'
+#' PATH_url = "https://FILE.xml"
+#' con = url(PATH_url, method = "libcurl")
+#' tmp_dat = read.delim(con, quote = "\"", comment.char = "", stringsAsFactors = FALSE)
+#'
+#' vec_docs = unlist(lapply(1:length(as.vector(tmp_dat[, 1])), function(x)
+#' trimws(tmp_dat[x, 1], which = "both")))
+#'
+#' parse_data = text_file_parser(input_path_file = vec_docs,
+#' start_query = c("", "", ""),
+#' end_query = c("", "", ""),
+#' min_lines = 1, trimmed_line = TRUE)
+#'
+#' }
text_file_parser = function(input_path_file = NULL, output_path_file = "", start_query = NULL, end_query = NULL, min_lines = 1, trimmed_line = FALSE, verbose = FALSE) {
@@ -1403,7 +1389,6 @@ text_file_parser = function(input_path_file = NULL, output_path_file = "", start
#' # tk$print_words_lookup_tbl(n_gram = 'e_w')
-
token_stats <- R6::R6Class("token_stats",
public = list(
@@ -1726,9 +1711,6 @@ cosine_distance = function(sentence1, sentence2, split_separator = " ") {
}
-
-
-
#' Term matrices and statistics ( document-term-matrix, term-document-matrix)
#'
#'
@@ -1825,62 +1807,59 @@ cosine_distance = function(sentence1, sentence2, split_separator = " ") {
#'
#' # document_term_matrix = TRUE)
#' @examples
+#'
+#' \dontrun{
#'
#' library(textTinyR)
#'
#'
-#' # sm <- sparse_term_matrix$new(file_data = "/folder/my_data.txt",
-#'
-#' # document_term_matrix = TRUE)
-#'
+#' sm <- sparse_term_matrix$new(file_data = "/folder/my_data.txt",
+#' document_term_matrix = TRUE)
+#'
#' #--------------
#' # term matrix :
#' #--------------
-#'
-#' # sm$Term_Matrix(sort_terms = TRUE, to_lower = TRUE,
-#'
-#' # trim_token = TRUE, split_string = TRUE,
-#'
-#' # remove_stopwords = TRUE, normalize = 'l1',
-#'
-#' # stemmer = 'porter2_stemmer', threads = 1 )
-#'
+#'
+#' sm$Term_Matrix(sort_terms = TRUE, to_lower = TRUE,
+#' trim_token = TRUE, split_string = TRUE,
+#' remove_stopwords = TRUE, normalize = 'l1',
+#' stemmer = 'porter2_stemmer', threads = 1 )
+#'
#' #---------------
#' # triplet data :
#' #---------------
-#'
-#' # sm$triplet_data()
-#'
-#'
+#'
+#' sm$triplet_data()
+#'
+#'
#' #----------------------
#' # global-term-weights :
#' #----------------------
-#'
-#' # sm$global_term_weights()
-#'
-#'
+#'
+#' sm$global_term_weights()
+#'
+#'
#' #-------------------------
#' # removal of sparse terms:
#' #-------------------------
-#'
-#' # sm$Term_Matrix_Adjust(sparsity_thresh = 0.995)
-#'
-#'
+#'
+#' sm$Term_Matrix_Adjust(sparsity_thresh = 0.995)
+#'
+#'
#' #-----------------------------------------------
#' # associations between terms of a sparse matrix:
#' #-----------------------------------------------
-#'
-#'
-#' # sm$term_associations(Terms = c("word", "sentence"), keep_terms = 10)
-#'
-#'
+#'
+#' sm$term_associations(Terms = c("word", "sentence"), keep_terms = 10)
+#'
+#'
#' #---------------------------------------------
#' # most frequent terms using the sparse matrix:
#' #---------------------------------------------
-#'
-#'
-#' # sm$most_frequent_terms(keep_terms = 10, threads = 1)
-
+#'
+#' sm$most_frequent_terms(keep_terms = 10, threads = 1)
+#'
+#' }
sparse_term_matrix <- R6::R6Class("sparse_term_matrix",
@@ -2509,8 +2488,6 @@ sparse_Means = function(sparse_matrix, rowMeans = FALSE) {
}
-
-
#' sparsity percentage of a sparse matrix
#'
#'
@@ -2536,7 +2513,6 @@ matrix_sparsity = function(sparse_matrix) {
}
-
#' save a sparse matrix in binary format
#'
#'
@@ -2567,7 +2543,6 @@ save_sparse_binary = function(sparse_matrix, file_name = "save_sparse.mat") {
}
-
#' load a sparse matrix in binary format
#'
#'
@@ -2576,9 +2551,13 @@ save_sparse_binary = function(sparse_matrix, file_name = "save_sparse.mat") {
#' @export
#' @examples
#'
+#' \dontrun{
+#'
#' library(textTinyR)
#'
-#' # load_sparse_binary(file_name = "save_sparse.mat")
+#' load_sparse_binary(file_name = "save_sparse.mat")
+#'
+#' }
load_sparse_binary = function(file_name = "save_sparse.mat") {
@@ -2591,8 +2570,6 @@ load_sparse_binary = function(file_name = "save_sparse.mat") {
}
-
-
#' read a specific number of characters from a text file
#'
#'
@@ -2601,10 +2578,14 @@ load_sparse_binary = function(file_name = "save_sparse.mat") {
#' @param write_2file either an empty string ("") or a character string specifying a valid output file to write the subset of the input file
#' @export
#' @examples
+#'
+#' \dontrun{
#'
#' library(textTinyR)
#'
-#' # txfl = read_characters(input_file = 'input.txt', characters = 100)
+#' txfl = read_characters(input_file = 'input.txt', characters = 100)
+#'
+#' }
read_characters = function(input_file = NULL, characters = 100, write_2file = "") {
@@ -2638,10 +2619,14 @@ read_characters = function(input_file = NULL, characters = 100, write_2file = ""
#' @param write_2file either "" or a character string specifying a valid output file to write the subset of the input file
#' @export
#' @examples
+#'
+#' \dontrun{
#'
#' library(textTinyR)
#'
-#' # txfl = read_rows(input_file = 'input.txt', rows = 100)
+#' txfl = read_rows(input_file = 'input.txt', rows = 100)
+#'
+#' }
read_rows = function(input_file = NULL, read_delimiter = "\n", rows = 100, write_2file = "") {
@@ -2665,12 +2650,8 @@ read_rows = function(input_file = NULL, read_delimiter = "\n", rows = 100, write
}
-
#------------------------------------------------------------------------------------------------------------------------------------------------------------ Word-Vector-Utility functions
-
-
-
#' dimensions of a word vectors file
#'
#' @param input_file a character string specifying a valid path to a text file
@@ -3325,3 +3306,22 @@ cluster_frequency = function(tokenized_list_text, cluster_vector, verbose = FALS
}
+#' Compute batches
+#'
+#' @param n_rows a numeric specifying the number of rows
+#' @param n_batches a numeric specifying the number of output batches
+#' @return a list
+#' @export
+#' @examples
+#'
+#' library(textTinyR)
+#'
+#' btch = batch_compute(n_rows = 1000, n_batches = 10)
+
+
+batch_compute = function(n_rows, n_batches) {
+ if (!inherits(n_rows, c('numeric', 'integer'))) stop("the 'n_rows' parameter should be of type either numeric or integer", call. = F)
+ if (!inherits(n_batches, c('numeric', 'integer'))) stop("the 'n_batches' parameter should be of type either numeric or integer", call. = F)
+
+ return(batch_calculation(n_rows, n_batches))
+}
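
A minimal pure-R sketch of the batching logic introduced above (an illustration only: the exported batch_compute() delegates to the Rcpp routine batch_calculation(), and the exact shape of the returned list is an assumption here, not taken from the diff):

    # hypothetical helper, not part of the package: split 1:n_rows into
    # n_batches contiguous (start, end) index ranges
    batch_compute_sketch = function(n_rows, n_batches) {
      splits = floor(seq(1, n_rows + 1, length.out = n_batches + 1))
      lapply(seq_len(n_batches), function(i) c(start = splits[i], end = splits[i + 1] - 1))
    }

    batch_compute_sketch(n_rows = 1000, n_batches = 10)
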
diff --git a/README.md b/README.md
index 5e7816a..efcaa8b 100644
--- a/README.md
+++ b/README.md
@@ -99,7 +99,7 @@ chmod -R 777 /home/YOUR_DIR
-The **USER** defaults to *rstudio* but you have to give your **PASSWORD** of preference (see [www.rocker-project.org](https://www.rocker-project.org/) for more information).
+The **USER** defaults to *rstudio*, but you have to provide a **PASSWORD** of your preference (see [https://rocker-project.org/](https://rocker-project.org/) for more information).
@@ -144,7 +144,7 @@ If you use the code of this repository in your paper or research please cite bot
title = {{textTinyR}: Text Processing for Small or Big Data Files},
author = {Lampros Mouselimis},
year = {2021},
- note = {R package version 1.1.7},
+ note = {R package version 1.1.8},
url = {https://CRAN.R-project.org/package=textTinyR},
}
```
diff --git a/man/Doc2Vec.Rd b/man/Doc2Vec.Rd
index 8904a4c..db48ec5 100644
--- a/man/Doc2Vec.Rd
+++ b/man/Doc2Vec.Rd
@@ -76,15 +76,15 @@ out = init$doc2vec_methods(method = "sum_sqrt")
\section{Methods}{
\subsection{Public methods}{
\itemize{
-\item \href{#method-new}{\code{Doc2Vec$new()}}
-\item \href{#method-doc2vec_methods}{\code{Doc2Vec$doc2vec_methods()}}
-\item \href{#method-pre_processed_wv}{\code{Doc2Vec$pre_processed_wv()}}
-\item \href{#method-clone}{\code{Doc2Vec$clone()}}
+\item \href{#method-documents_to_wordvectors-new}{\code{Doc2Vec$new()}}
+\item \href{#method-documents_to_wordvectors-doc2vec_methods}{\code{Doc2Vec$doc2vec_methods()}}
+\item \href{#method-documents_to_wordvectors-pre_processed_wv}{\code{Doc2Vec$pre_processed_wv()}}
+\item \href{#method-documents_to_wordvectors-clone}{\code{Doc2Vec$clone()}}
}
}
\if{html}{\out{
}}
-\if{html}{\out{}}
-\if{latex}{\out{\hypertarget{method-new}{}}}
+\if{html}{\out{}}
+\if{latex}{\out{\hypertarget{method-documents_to_wordvectors-new}{}}}
\subsection{Method \code{new()}}{
\subsection{Usage}{
\if{html}{\out{}}\preformatted{Doc2Vec$new(
@@ -113,8 +113,8 @@ out = init$doc2vec_methods(method = "sum_sqrt")
}
}
\if{html}{\out{
}}
-\if{html}{\out{
}}
-\if{latex}{\out{\hypertarget{method-doc2vec_methods}{}}}
+\if{html}{\out{
}}
+\if{latex}{\out{\hypertarget{method-documents_to_wordvectors-doc2vec_methods}{}}}
\subsection{Method \code{doc2vec_methods()}}{
\subsection{Usage}{
\if{html}{\out{
}}\preformatted{Doc2Vec$doc2vec_methods(
@@ -137,8 +137,8 @@ out = init$doc2vec_methods(method = "sum_sqrt")
}
}
\if{html}{\out{
}}
-\if{html}{\out{
}}
-\if{latex}{\out{\hypertarget{method-pre_processed_wv}{}}}
+\if{html}{\out{
}}
+\if{latex}{\out{\hypertarget{method-documents_to_wordvectors-pre_processed_wv}{}}}
\subsection{Method \code{pre_processed_wv()}}{
\subsection{Usage}{
\if{html}{\out{
}}\preformatted{Doc2Vec$pre_processed_wv()}\if{html}{\out{
}}
@@ -146,8 +146,8 @@ out = init$doc2vec_methods(method = "sum_sqrt")
}
\if{html}{\out{
}}
-\if{html}{\out{
}}
-\if{latex}{\out{\hypertarget{method-clone}{}}}
+\if{html}{\out{
}}
+\if{latex}{\out{\hypertarget{method-documents_to_wordvectors-clone}{}}}
\subsection{Method \code{clone()}}{
The objects of this class are cloneable with this method.
\subsection{Usage}{
diff --git a/man/batch_compute.Rd b/man/batch_compute.Rd
new file mode 100644
index 0000000..bd5e72a
--- /dev/null
+++ b/man/batch_compute.Rd
@@ -0,0 +1,25 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{batch_compute}
+\alias{batch_compute}
+\title{Compute batches}
+\usage{
+batch_compute(n_rows, n_batches)
+}
+\arguments{
+\item{n_rows}{a numeric specifying the number of rows}
+
+\item{n_batches}{a numeric specifying the number of output batches}
+}
+\value{
+a list
+}
+\description{
+Compute batches
+}
+\examples{
+
+library(textTinyR)
+
+btch = batch_compute(n_rows = 1000, n_batches = 10)
+}
diff --git a/man/big_tokenize_transform.Rd b/man/big_tokenize_transform.Rd
index fa374a3..fa5edf5 100644
--- a/man/big_tokenize_transform.Rd
+++ b/man/big_tokenize_transform.Rd
@@ -49,35 +49,30 @@ The \emph{ngram_sequential} or \emph{ngram_overlap} stemming method applies to e
\examples{
-library(textTinyR)
+\dontrun{
+library(textTinyR)
-# fs <- big_tokenize_transform$new(verbose = FALSE)
+fs <- big_tokenize_transform$new(verbose = FALSE)
#---------------
# file splitter:
#---------------
-# fs$big_text_splitter(input_path_file = "input.txt",
-
-# output_path_folder = "/folder/output/",
-
-# end_query = "endword", batches = 5,
-
-# trimmed_line = FALSE)
+fs$big_text_splitter(input_path_file = "input.txt",
+ output_path_folder = "/folder/output/",
+ end_query = "endword", batches = 5,
+ trimmed_line = FALSE)
#-------------
# file parser:
#-------------
-# fs$big_text_parser(input_path_folder = "/folder/output/",
-
-# output_path_folder = "/folder/parser/",
-
-# start_query = "startword", end_query = "endword",
-
-# min_lines = 1, trimmed_line = TRUE)
+fs$big_text_parser(input_path_folder = "/folder/output/",
+ output_path_folder = "/folder/parser/",
+ start_query = "startword", end_query = "endword",
+ min_lines = 1, trimmed_line = TRUE)
#----------------
@@ -85,45 +80,39 @@ library(textTinyR)
#----------------
-# fs$big_text_tokenizer(input_path_folder = "/folder/parser/",
-
-# batches = 5, split_string=TRUE,
-
-# to_lower = TRUE, trim_token = TRUE,
-
-# max_num_char = 100, remove_stopwords = TRUE,
-
-# stemmer = "porter2_stemmer", threads = 1,
-
-# path_2folder="/folder/output_token/",
-
-# vocabulary_path_folder="/folder/VOCAB/")
+ fs$big_text_tokenizer(input_path_folder = "/folder/parser/",
+ batches = 5, split_string=TRUE,
+ to_lower = TRUE, trim_token = TRUE,
+ max_num_char = 100, remove_stopwords = TRUE,
+ stemmer = "porter2_stemmer", threads = 1,
+ path_2folder="/folder/output_token/",
+ vocabulary_path_folder="/folder/VOCAB/")
#-------------------
# vocabulary counts:
#-------------------
-# fs$vocabulary_accumulator(input_path_folder = "/folder/VOCAB/",
+fs$vocabulary_accumulator(input_path_folder = "/folder/VOCAB/",
+ vocabulary_path_file = "/folder/vocab.txt",
+ max_num_chars = 50)
-# vocabulary_path_file = "/folder/vocab.txt",
-
-# max_num_chars = 50)
+}
}
\section{Methods}{
\subsection{Public methods}{
\itemize{
-\item \href{#method-new}{\code{big_tokenize_transform$new()}}
-\item \href{#method-big_text_splitter}{\code{big_tokenize_transform$big_text_splitter()}}
-\item \href{#method-big_text_parser}{\code{big_tokenize_transform$big_text_parser()}}
-\item \href{#method-big_text_tokenizer}{\code{big_tokenize_transform$big_text_tokenizer()}}
-\item \href{#method-vocabulary_accumulator}{\code{big_tokenize_transform$vocabulary_accumulator()}}
-\item \href{#method-clone}{\code{big_tokenize_transform$clone()}}
+\item \href{#method-big_tokenize_transform-new}{\code{big_tokenize_transform$new()}}
+\item \href{#method-big_tokenize_transform-big_text_splitter}{\code{big_tokenize_transform$big_text_splitter()}}
+\item \href{#method-big_tokenize_transform-big_text_parser}{\code{big_tokenize_transform$big_text_parser()}}
+\item \href{#method-big_tokenize_transform-big_text_tokenizer}{\code{big_tokenize_transform$big_text_tokenizer()}}
+\item \href{#method-big_tokenize_transform-vocabulary_accumulator}{\code{big_tokenize_transform$vocabulary_accumulator()}}
+\item \href{#method-big_tokenize_transform-clone}{\code{big_tokenize_transform$clone()}}
}
}
\if{html}{\out{
}}
-\if{html}{\out{
}}
-\if{latex}{\out{\hypertarget{method-new}{}}}
+\if{html}{\out{
}}
+\if{latex}{\out{\hypertarget{method-big_tokenize_transform-new}{}}}
\subsection{Method \code{new()}}{
\subsection{Usage}{
\if{html}{\out{
}}\preformatted{big_tokenize_transform$new(verbose = FALSE)}\if{html}{\out{
}}
@@ -138,8 +127,8 @@ library(textTinyR)
}
}
\if{html}{\out{
}}
-\if{html}{\out{
}}
-\if{latex}{\out{\hypertarget{method-big_text_splitter}{}}}
+\if{html}{\out{
}}
+\if{latex}{\out{\hypertarget{method-big_tokenize_transform-big_text_splitter}{}}}
\subsection{Method \code{big_text_splitter()}}{
\subsection{Usage}{
\if{html}{\out{
}}\preformatted{big_tokenize_transform$big_text_splitter(
@@ -168,8 +157,8 @@ library(textTinyR)
}
}
\if{html}{\out{
}}
-\if{html}{\out{
}}
-\if{latex}{\out{\hypertarget{method-big_text_parser}{}}}
+\if{html}{\out{
}}
+\if{latex}{\out{\hypertarget{method-big_tokenize_transform-big_text_parser}{}}}
\subsection{Method \code{big_text_parser()}}{
\subsection{Usage}{
\if{html}{\out{
}}\preformatted{big_tokenize_transform$big_text_parser(
@@ -201,8 +190,8 @@ library(textTinyR)
}
}
\if{html}{\out{
}}
-\if{html}{\out{
}}
-\if{latex}{\out{\hypertarget{method-big_text_tokenizer}{}}}
+\if{html}{\out{
}}
+\if{latex}{\out{\hypertarget{method-big_tokenize_transform-big_text_tokenizer}{}}}
\subsection{Method \code{big_text_tokenizer()}}{
\subsection{Usage}{
\if{html}{\out{
}}\preformatted{big_tokenize_transform$big_text_tokenizer(
@@ -321,8 +310,8 @@ are \emph{afrikaans}, \emph{arabic}, \emph{armenian}, \emph{basque}, \emph{benga
}
}
\if{html}{\out{
}}
-\if{html}{\out{
}}
-\if{latex}{\out{\hypertarget{method-vocabulary_accumulator}{}}}
+\if{html}{\out{
}}
+\if{latex}{\out{\hypertarget{method-big_tokenize_transform-vocabulary_accumulator}{}}}
\subsection{Method \code{vocabulary_accumulator()}}{
\subsection{Usage}{
\if{html}{\out{
}}\preformatted{big_tokenize_transform$vocabulary_accumulator(
@@ -345,8 +334,8 @@ are \emph{afrikaans}, \emph{arabic}, \emph{armenian}, \emph{basque}, \emph{benga
}
}
\if{html}{\out{
}}
-\if{html}{\out{
}}
-\if{latex}{\out{\hypertarget{method-clone}{}}}
+\if{html}{\out{
}}
+\if{latex}{\out{\hypertarget{method-big_tokenize_transform-clone}{}}}
\subsection{Method \code{clone()}}{
The objects of this class are cloneable with this method.
\subsection{Usage}{
diff --git a/man/bytes_converter.Rd b/man/bytes_converter.Rd
index c315c03..310da9f 100644
--- a/man/bytes_converter.Rd
+++ b/man/bytes_converter.Rd
@@ -19,7 +19,11 @@ bytes converter of a text file ( KB, MB or GB )
}
\examples{
+\dontrun{
+
library(textTinyR)
-# bc = bytes_converter(input_path_file = 'some_file.txt', unit = "MB")
+bc = bytes_converter(input_path_file = 'some_file.txt', unit = "MB")
+
+}
}
diff --git a/man/load_sparse_binary.Rd b/man/load_sparse_binary.Rd
index d50a70a..cb40884 100644
--- a/man/load_sparse_binary.Rd
+++ b/man/load_sparse_binary.Rd
@@ -17,7 +17,11 @@ load a sparse matrix in binary format
}
\examples{
+\dontrun{
+
library(textTinyR)
-# load_sparse_binary(file_name = "save_sparse.mat")
+load_sparse_binary(file_name = "save_sparse.mat")
+
+}
}
diff --git a/man/read_characters.Rd b/man/read_characters.Rd
index e14c773..4928381 100644
--- a/man/read_characters.Rd
+++ b/man/read_characters.Rd
@@ -18,7 +18,11 @@ read a specific number of characters from a text file
}
\examples{
+\dontrun{
+
library(textTinyR)
-# txfl = read_characters(input_file = 'input.txt', characters = 100)
+txfl = read_characters(input_file = 'input.txt', characters = 100)
+
+}
}
diff --git a/man/read_rows.Rd b/man/read_rows.Rd
index 65ad6af..8bed41e 100644
--- a/man/read_rows.Rd
+++ b/man/read_rows.Rd
@@ -25,7 +25,11 @@ read a specific number of rows from a text file
}
\examples{
+\dontrun{
+
library(textTinyR)
-# txfl = read_rows(input_file = 'input.txt', rows = 100)
+txfl = read_rows(input_file = 'input.txt', rows = 100)
+
+}
}
diff --git a/man/sparse_term_matrix.Rd b/man/sparse_term_matrix.Rd
index c9bb187..7e2cfcf 100644
--- a/man/sparse_term_matrix.Rd
+++ b/man/sparse_term_matrix.Rd
@@ -63,77 +63,75 @@ Stemming of the english language is done using the porter2-stemmer, for details
\examples{
-library(textTinyR)
+\dontrun{
+library(textTinyR)
-# sm <- sparse_term_matrix$new(file_data = "/folder/my_data.txt",
-# document_term_matrix = TRUE)
+sm <- sparse_term_matrix$new(file_data = "/folder/my_data.txt",
+ document_term_matrix = TRUE)
#--------------
# term matrix :
#--------------
-# sm$Term_Matrix(sort_terms = TRUE, to_lower = TRUE,
-
-# trim_token = TRUE, split_string = TRUE,
-
-# remove_stopwords = TRUE, normalize = 'l1',
-
-# stemmer = 'porter2_stemmer', threads = 1 )
+sm$Term_Matrix(sort_terms = TRUE, to_lower = TRUE,
+ trim_token = TRUE, split_string = TRUE,
+ remove_stopwords = TRUE, normalize = 'l1',
+ stemmer = 'porter2_stemmer', threads = 1 )
#---------------
# triplet data :
#---------------
-# sm$triplet_data()
+sm$triplet_data()
#----------------------
# global-term-weights :
#----------------------
-# sm$global_term_weights()
+sm$global_term_weights()
#-------------------------
# removal of sparse terms:
#-------------------------
-# sm$Term_Matrix_Adjust(sparsity_thresh = 0.995)
+sm$Term_Matrix_Adjust(sparsity_thresh = 0.995)
#-----------------------------------------------
# associations between terms of a sparse matrix:
#-----------------------------------------------
-
-# sm$term_associations(Terms = c("word", "sentence"), keep_terms = 10)
+sm$term_associations(Terms = c("word", "sentence"), keep_terms = 10)
#---------------------------------------------
# most frequent terms using the sparse matrix:
#---------------------------------------------
+sm$most_frequent_terms(keep_terms = 10, threads = 1)
-# sm$most_frequent_terms(keep_terms = 10, threads = 1)
+}
}
\section{Methods}{
\subsection{Public methods}{
\itemize{
-\item \href{#method-new}{\code{sparse_term_matrix$new()}}
-\item \href{#method-Term_Matrix}{\code{sparse_term_matrix$Term_Matrix()}}
-\item \href{#method-triplet_data}{\code{sparse_term_matrix$triplet_data()}}
-\item \href{#method-global_term_weights}{\code{sparse_term_matrix$global_term_weights()}}
-\item \href{#method-Term_Matrix_Adjust}{\code{sparse_term_matrix$Term_Matrix_Adjust()}}
-\item \href{#method-term_associations}{\code{sparse_term_matrix$term_associations()}}
-\item \href{#method-most_frequent_terms}{\code{sparse_term_matrix$most_frequent_terms()}}
-\item \href{#method-clone}{\code{sparse_term_matrix$clone()}}
+\item \href{#method-sparse_term_matrix-new}{\code{sparse_term_matrix$new()}}
+\item \href{#method-sparse_term_matrix-Term_Matrix}{\code{sparse_term_matrix$Term_Matrix()}}
+\item \href{#method-sparse_term_matrix-triplet_data}{\code{sparse_term_matrix$triplet_data()}}
+\item \href{#method-sparse_term_matrix-global_term_weights}{\code{sparse_term_matrix$global_term_weights()}}
+\item \href{#method-sparse_term_matrix-Term_Matrix_Adjust}{\code{sparse_term_matrix$Term_Matrix_Adjust()}}
+\item \href{#method-sparse_term_matrix-term_associations}{\code{sparse_term_matrix$term_associations()}}
+\item \href{#method-sparse_term_matrix-most_frequent_terms}{\code{sparse_term_matrix$most_frequent_terms()}}
+\item \href{#method-sparse_term_matrix-clone}{\code{sparse_term_matrix$clone()}}
}
}
\if{html}{\out{
}}
-\if{html}{\out{
}}
-\if{latex}{\out{\hypertarget{method-new}{}}}
+\if{html}{\out{
}}
+\if{latex}{\out{\hypertarget{method-sparse_term_matrix-new}{}}}
\subsection{Method \code{new()}}{
\subsection{Usage}{
\if{html}{\out{
}}\preformatted{sparse_term_matrix$new(
@@ -156,8 +154,8 @@ library(textTinyR)
}
}
\if{html}{\out{
}}
-\if{html}{\out{
}}
-\if{latex}{\out{\hypertarget{method-Term_Matrix}{}}}
+\if{html}{\out{
}}
+\if{latex}{\out{\hypertarget{method-sparse_term_matrix-Term_Matrix}{}}}
\subsection{Method \code{Term_Matrix()}}{
\subsection{Usage}{
\if{html}{\out{
}}\preformatted{sparse_term_matrix$Term_Matrix(
@@ -255,8 +253,8 @@ are \emph{afrikaans}, \emph{arabic}, \emph{armenian}, \emph{basque}, \emph{benga
}
}
\if{html}{\out{
}}
-\if{html}{\out{
}}
-\if{latex}{\out{\hypertarget{method-triplet_data}{}}}
+\if{html}{\out{
}}
+\if{latex}{\out{\hypertarget{method-sparse_term_matrix-triplet_data}{}}}
\subsection{Method \code{triplet_data()}}{
\subsection{Usage}{
\if{html}{\out{
}}\preformatted{sparse_term_matrix$triplet_data()}\if{html}{\out{
}}
@@ -264,8 +262,8 @@ are \emph{afrikaans}, \emph{arabic}, \emph{armenian}, \emph{basque}, \emph{benga
}
\if{html}{\out{
}}
-\if{html}{\out{
}}
-\if{latex}{\out{\hypertarget{method-global_term_weights}{}}}
+\if{html}{\out{
}}
+\if{latex}{\out{\hypertarget{method-sparse_term_matrix-global_term_weights}{}}}
\subsection{Method \code{global_term_weights()}}{
\subsection{Usage}{
\if{html}{\out{
}}\preformatted{sparse_term_matrix$global_term_weights()}\if{html}{\out{
}}
@@ -273,8 +271,8 @@ are \emph{afrikaans}, \emph{arabic}, \emph{armenian}, \emph{basque}, \emph{benga
}
\if{html}{\out{
}}
-\if{html}{\out{
}}
-\if{latex}{\out{\hypertarget{method-Term_Matrix_Adjust}{}}}
+\if{html}{\out{
}}
+\if{latex}{\out{\hypertarget{method-sparse_term_matrix-Term_Matrix_Adjust}{}}}
\subsection{Method \code{Term_Matrix_Adjust()}}{
\subsection{Usage}{
\if{html}{\out{
}}\preformatted{sparse_term_matrix$Term_Matrix_Adjust(sparsity_thresh = 1)}\if{html}{\out{
}}
@@ -289,8 +287,8 @@ are \emph{afrikaans}, \emph{arabic}, \emph{armenian}, \emph{basque}, \emph{benga
}
}
\if{html}{\out{
}}
-\if{html}{\out{
}}
-\if{latex}{\out{\hypertarget{method-term_associations}{}}}
+\if{html}{\out{
}}
+\if{latex}{\out{\hypertarget{method-sparse_term_matrix-term_associations}{}}}
\subsection{Method \code{term_associations()}}{
\subsection{Usage}{
\if{html}{\out{
}}\preformatted{sparse_term_matrix$term_associations(
@@ -313,8 +311,8 @@ are \emph{afrikaans}, \emph{arabic}, \emph{armenian}, \emph{basque}, \emph{benga
}
}
\if{html}{\out{
}}
-\if{html}{\out{
}}
-\if{latex}{\out{\hypertarget{method-most_frequent_terms}{}}}
+\if{html}{\out{
}}
+\if{latex}{\out{\hypertarget{method-sparse_term_matrix-most_frequent_terms}{}}}
\subsection{Method \code{most_frequent_terms()}}{
\subsection{Usage}{
\if{html}{\out{
}}\preformatted{sparse_term_matrix$most_frequent_terms(
@@ -337,8 +335,8 @@ are \emph{afrikaans}, \emph{arabic}, \emph{armenian}, \emph{basque}, \emph{benga
}
}
\if{html}{\out{
}}
-\if{html}{\out{
}}
-\if{latex}{\out{\hypertarget{method-clone}{}}}
+\if{html}{\out{
}}
+\if{latex}{\out{\hypertarget{method-sparse_term_matrix-clone}{}}}
\subsection{Method \code{clone()}}{
The objects of this class are cloneable with this method.
\subsection{Usage}{
diff --git a/man/text_file_parser.Rd b/man/text_file_parser.Rd
index 5237dd6..2bd9535 100644
--- a/man/text_file_parser.Rd
+++ b/man/text_file_parser.Rd
@@ -37,38 +37,33 @@ The text file should have a structure (such as an xml-structure), so that subset
}
\examples{
+\dontrun{
+
library(textTinyR)
# In case that the 'input_path_file' is a valid path
#---------------------------------------------------
-# fp = text_file_parser(input_path_file = '/folder/input_data.txt',
-
-# output_path_file = '/folder/output_data.txt',
-
-# start_query = 'word_a', end_query = 'word_w',
-
-# min_lines = 1, trimmed_line = FALSE)
+fp = text_file_parser(input_path_file = '/folder/input_data.txt',
+ output_path_file = '/folder/output_data.txt',
+ start_query = 'word_a', end_query = 'word_w',
+ min_lines = 1, trimmed_line = FALSE)
# In case that the 'input_path_file' is a character vector of strings
#--------------------------------------------------------------------
-# PATH_url = "https://FILE.xml"
-
-# con = url(PATH_url, method = "libcurl")
-
-# tmp_dat = read.delim(con, quote = "\"", comment.char = "", stringsAsFactors = FALSE)
+PATH_url = "https://FILE.xml"
+con = url(PATH_url, method = "libcurl")
+tmp_dat = read.delim(con, quote = "\"", comment.char = "", stringsAsFactors = FALSE)
-# vec_docs = unlist(lapply(1:length(as.vector(tmp_dat[, 1])), function(x)
+vec_docs = unlist(lapply(1:length(as.vector(tmp_dat[, 1])), function(x)
+ trimws(tmp_dat[x, 1], which = "both")))
-# trimws(tmp_dat[x, 1], which = "both")))
-
-# parse_data = text_file_parser(input_path_file = vec_docs,
-
-# start_query = c("
", "", ""),
-
-# end_query = c("", "", ""),
-
-# min_lines = 1, trimmed_line = TRUE)
+parse_data = text_file_parser(input_path_file = vec_docs,
+ start_query = c("
", "", ""),
+ end_query = c("", "", ""),
+ min_lines = 1, trimmed_line = TRUE)
+
+}
}
diff --git a/man/text_intersect.Rd b/man/text_intersect.Rd
index fc853fd..a79c84c 100644
--- a/man/text_intersect.Rd
+++ b/man/text_intersect.Rd
@@ -61,15 +61,15 @@ https://www.kaggle.com/c/home-depot-product-search-relevance/discussion/20427 by
\section{Methods}{
\subsection{Public methods}{
\itemize{
-\item \href{#method-new}{\code{text_intersect$new()}}
-\item \href{#method-count_intersect}{\code{text_intersect$count_intersect()}}
-\item \href{#method-ratio_intersect}{\code{text_intersect$ratio_intersect()}}
-\item \href{#method-clone}{\code{text_intersect$clone()}}
+\item \href{#method-text_intersect-new}{\code{text_intersect$new()}}
+\item \href{#method-text_intersect-count_intersect}{\code{text_intersect$count_intersect()}}
+\item \href{#method-text_intersect-ratio_intersect}{\code{text_intersect$ratio_intersect()}}
+\item \href{#method-text_intersect-clone}{\code{text_intersect$clone()}}
}
}
\if{html}{\out{
}}
-\if{html}{\out{
}}
-\if{latex}{\out{\hypertarget{method-new}{}}}
+\if{html}{\out{
}}
+\if{latex}{\out{\hypertarget{method-text_intersect-new}{}}}
\subsection{Method \code{new()}}{
\subsection{Usage}{
\if{html}{\out{
}}\preformatted{text_intersect$new(token_list1 = NULL, token_list2 = NULL)}\if{html}{\out{
}}
@@ -86,8 +86,8 @@ https://www.kaggle.com/c/home-depot-product-search-relevance/discussion/20427 by
}
}
\if{html}{\out{
}}
-\if{html}{\out{
}}
-\if{latex}{\out{\hypertarget{method-count_intersect}{}}}
+\if{html}{\out{
}}
+\if{latex}{\out{\hypertarget{method-text_intersect-count_intersect}{}}}
\subsection{Method \code{count_intersect()}}{
\subsection{Usage}{
\if{html}{\out{
}}\preformatted{text_intersect$count_intersect(distinct = FALSE, letters = FALSE)}\if{html}{\out{
}}
@@ -104,8 +104,8 @@ https://www.kaggle.com/c/home-depot-product-search-relevance/discussion/20427 by
}
}
\if{html}{\out{
}}
-\if{html}{\out{
}}
-\if{latex}{\out{\hypertarget{method-ratio_intersect}{}}}
+\if{html}{\out{
}}
+\if{latex}{\out{\hypertarget{method-text_intersect-ratio_intersect}{}}}
\subsection{Method \code{ratio_intersect()}}{
\subsection{Usage}{
\if{html}{\out{
}}\preformatted{text_intersect$ratio_intersect(distinct = FALSE, letters = FALSE)}\if{html}{\out{
}}
@@ -122,8 +122,8 @@ https://www.kaggle.com/c/home-depot-product-search-relevance/discussion/20427 by
}
}
\if{html}{\out{
}}
-\if{html}{\out{
}}
-\if{latex}{\out{\hypertarget{method-clone}{}}}
+\if{html}{\out{
}}
+\if{latex}{\out{\hypertarget{method-text_intersect-clone}{}}}
\subsection{Method \code{clone()}}{
The objects of this class are cloneable with this method.
\subsection{Usage}{
diff --git a/man/token_stats.Rd b/man/token_stats.Rd
index 025d957..551a58c 100644
--- a/man/token_stats.Rd
+++ b/man/token_stats.Rd
@@ -129,23 +129,23 @@ lut <- tk$look_up_table(n_grams = 3)
\section{Methods}{
\subsection{Public methods}{
\itemize{
-\item \href{#method-new}{\code{token_stats$new()}}
-\item \href{#method-path_2vector}{\code{token_stats$path_2vector()}}
-\item \href{#method-freq_distribution}{\code{token_stats$freq_distribution()}}
-\item \href{#method-print_frequency}{\code{token_stats$print_frequency()}}
-\item \href{#method-count_character}{\code{token_stats$count_character()}}
-\item \href{#method-print_count_character}{\code{token_stats$print_count_character()}}
-\item \href{#method-collocation_words}{\code{token_stats$collocation_words()}}
-\item \href{#method-print_collocations}{\code{token_stats$print_collocations()}}
-\item \href{#method-string_dissimilarity_matrix}{\code{token_stats$string_dissimilarity_matrix()}}
-\item \href{#method-look_up_table}{\code{token_stats$look_up_table()}}
-\item \href{#method-print_words_lookup_tbl}{\code{token_stats$print_words_lookup_tbl()}}
-\item \href{#method-clone}{\code{token_stats$clone()}}
+\item \href{#method-token_stats-new}{\code{token_stats$new()}}
+\item \href{#method-token_stats-path_2vector}{\code{token_stats$path_2vector()}}
+\item \href{#method-token_stats-freq_distribution}{\code{token_stats$freq_distribution()}}
+\item \href{#method-token_stats-print_frequency}{\code{token_stats$print_frequency()}}
+\item \href{#method-token_stats-count_character}{\code{token_stats$count_character()}}
+\item \href{#method-token_stats-print_count_character}{\code{token_stats$print_count_character()}}
+\item \href{#method-token_stats-collocation_words}{\code{token_stats$collocation_words()}}
+\item \href{#method-token_stats-print_collocations}{\code{token_stats$print_collocations()}}
+\item \href{#method-token_stats-string_dissimilarity_matrix}{\code{token_stats$string_dissimilarity_matrix()}}
+\item \href{#method-token_stats-look_up_table}{\code{token_stats$look_up_table()}}
+\item \href{#method-token_stats-print_words_lookup_tbl}{\code{token_stats$print_words_lookup_tbl()}}
+\item \href{#method-token_stats-clone}{\code{token_stats$clone()}}
}
}
\if{html}{\out{
}}
-\if{html}{\out{
}}
-\if{latex}{\out{\hypertarget{method-new}{}}}
+\if{html}{\out{
}}
+\if{latex}{\out{\hypertarget{method-token_stats-new}{}}}
\subsection{Method \code{new()}}{
\subsection{Usage}{
\if{html}{\out{
}}\preformatted{token_stats$new(
@@ -174,8 +174,8 @@ lut <- tk$look_up_table(n_grams = 3)
}
}
\if{html}{\out{
}}
-\if{html}{\out{
}}
-\if{latex}{\out{\hypertarget{method-path_2vector}{}}}
+\if{html}{\out{
}}
+\if{latex}{\out{\hypertarget{method-token_stats-path_2vector}{}}}
\subsection{Method \code{path_2vector()}}{
\subsection{Usage}{
\if{html}{\out{
}}\preformatted{token_stats$path_2vector()}\if{html}{\out{
}}
@@ -183,8 +183,8 @@ lut <- tk$look_up_table(n_grams = 3)
}
\if{html}{\out{
}}
-\if{html}{\out{
}}
-\if{latex}{\out{\hypertarget{method-freq_distribution}{}}}
+\if{html}{\out{
}}
+\if{latex}{\out{\hypertarget{method-token_stats-freq_distribution}{}}}
\subsection{Method \code{freq_distribution()}}{
\subsection{Usage}{
\if{html}{\out{
}}\preformatted{token_stats$freq_distribution()}\if{html}{\out{
}}
@@ -192,8 +192,8 @@ lut <- tk$look_up_table(n_grams = 3)
}
\if{html}{\out{
}}
-\if{html}{\out{
}}
-\if{latex}{\out{\hypertarget{method-print_frequency}{}}}
+\if{html}{\out{
}}
+\if{latex}{\out{\hypertarget{method-token_stats-print_frequency}{}}}
\subsection{Method \code{print_frequency()}}{
\subsection{Usage}{
\if{html}{\out{
}}\preformatted{token_stats$print_frequency(subset = NULL)}\if{html}{\out{
}}
@@ -208,8 +208,8 @@ lut <- tk$look_up_table(n_grams = 3)
}
}
\if{html}{\out{
}}
-\if{html}{\out{
}}
-\if{latex}{\out{\hypertarget{method-count_character}{}}}
+\if{html}{\out{
}}
+\if{latex}{\out{\hypertarget{method-token_stats-count_character}{}}}
\subsection{Method \code{count_character()}}{
\subsection{Usage}{
\if{html}{\out{
}}\preformatted{token_stats$count_character()}\if{html}{\out{
}}
@@ -217,8 +217,8 @@ lut <- tk$look_up_table(n_grams = 3)
}
\if{html}{\out{
}}
-\if{html}{\out{
}}
-\if{latex}{\out{\hypertarget{method-print_count_character}{}}}
+\if{html}{\out{
}}
+\if{latex}{\out{\hypertarget{method-token_stats-print_count_character}{}}}
\subsection{Method \code{print_count_character()}}{
\subsection{Usage}{
\if{html}{\out{
}}\preformatted{token_stats$print_count_character(number = NULL)}\if{html}{\out{
}}
@@ -233,8 +233,8 @@ lut <- tk$look_up_table(n_grams = 3)
}
}
\if{html}{\out{
}}
-\if{html}{\out{
}}
-\if{latex}{\out{\hypertarget{method-collocation_words}{}}}
+\if{html}{\out{
}}
+\if{latex}{\out{\hypertarget{method-token_stats-collocation_words}{}}}
\subsection{Method \code{collocation_words()}}{
\subsection{Usage}{
\if{html}{\out{
}}\preformatted{token_stats$collocation_words()}\if{html}{\out{
}}
@@ -242,8 +242,8 @@ lut <- tk$look_up_table(n_grams = 3)
}
\if{html}{\out{
}}
-\if{html}{\out{
}}
-\if{latex}{\out{\hypertarget{method-print_collocations}{}}}
+\if{html}{\out{
}}
+\if{latex}{\out{\hypertarget{method-token_stats-print_collocations}{}}}
\subsection{Method \code{print_collocations()}}{
\subsection{Usage}{
\if{html}{\out{
}}\preformatted{token_stats$print_collocations(word = NULL)}\if{html}{\out{
}}
@@ -258,8 +258,8 @@ lut <- tk$look_up_table(n_grams = 3)
}
}
\if{html}{\out{
}}
-\if{html}{\out{
}}
-\if{latex}{\out{\hypertarget{method-string_dissimilarity_matrix}{}}}
+\if{html}{\out{
}}
+\if{latex}{\out{\hypertarget{method-token_stats-string_dissimilarity_matrix}{}}}
\subsection{Method \code{string_dissimilarity_matrix()}}{
\subsection{Usage}{
\if{html}{\out{
}}\preformatted{token_stats$string_dissimilarity_matrix(
@@ -294,8 +294,8 @@ lut <- tk$look_up_table(n_grams = 3)
}
}
\if{html}{\out{
}}
-\if{html}{\out{
}}
-\if{latex}{\out{\hypertarget{method-look_up_table}{}}}
+\if{html}{\out{
}}
+\if{latex}{\out{\hypertarget{method-token_stats-look_up_table}{}}}
\subsection{Method \code{look_up_table()}}{
\subsection{Usage}{
\if{html}{\out{
}}\preformatted{token_stats$look_up_table(n_grams = NULL)}\if{html}{\out{
}}
@@ -310,8 +310,8 @@ lut <- tk$look_up_table(n_grams = 3)
}
}
\if{html}{\out{
}}
-\if{html}{\out{
}}
-\if{latex}{\out{\hypertarget{method-print_words_lookup_tbl}{}}}
+\if{html}{\out{
}}
+\if{latex}{\out{\hypertarget{method-token_stats-print_words_lookup_tbl}{}}}
\subsection{Method \code{print_words_lookup_tbl()}}{
\subsection{Usage}{
\if{html}{\out{
}}\preformatted{token_stats$print_words_lookup_tbl(n_gram = NULL)}\if{html}{\out{
}}
@@ -326,8 +326,8 @@ lut <- tk$look_up_table(n_grams = 3)
}
}
\if{html}{\out{
}}
-\if{html}{\out{
}}
-\if{latex}{\out{\hypertarget{method-clone}{}}}
+\if{html}{\out{
}}
+\if{latex}{\out{\hypertarget{method-token_stats-clone}{}}}
\subsection{Method \code{clone()}}{
The objects of this class are cloneable with this method.
\subsection{Usage}{
diff --git a/man/vocabulary_parser.Rd b/man/vocabulary_parser.Rd
index 23f1133..160a4bf 100644
--- a/man/vocabulary_parser.Rd
+++ b/man/vocabulary_parser.Rd
@@ -110,13 +110,13 @@ Stemming of the english language is done using the porter2-stemmer, for details
}
\examples{
-library(textTinyR)
-
-# vps = vocabulary_parser(input_path_file = '/folder/input_data.txt',
-
-# start_query = 'start_word', end_query = 'end_word',
+\dontrun{
-# vocabulary_path_file = '/folder/vocab.txt',
+library(textTinyR)
-# to_lower = TRUE, split_string = TRUE)
+ vps = vocabulary_parser(input_path_file = '/folder/input_data.txt',
+ start_query = 'start_word', end_query = 'end_word',
+ vocabulary_path_file = '/folder/vocab.txt',
+ to_lower = TRUE, split_string = TRUE)
+}
}
diff --git a/src/Makevars b/src/Makevars
index 9ec610c..0dd6477 100644
--- a/src/Makevars
+++ b/src/Makevars
@@ -1,5 +1,4 @@
PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) -DARMA_64BIT_WORD
PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) $(SHLIB_OPENMP_CXXFLAGS)
-CXX_STD = CXX11
PKG_CPPFLAGS = -I../inst/include/
diff --git a/src/Makevars.win b/src/Makevars.win
index bfe6a5f..ca18a35 100644
--- a/src/Makevars.win
+++ b/src/Makevars.win
@@ -1,4 +1,3 @@
-PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) -DARMA_64BIT_WORD
-PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) $(SHLIB_OPENMP_CXXFLAGS) -mthreads
-CXX_STD = CXX11
-PKG_CPPFLAGS = -I../inst/include/
+PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) -DARMA_64BIT_WORD
+PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) $(SHLIB_OPENMP_CXXFLAGS)
+PKG_CPPFLAGS = -I../inst/include/
diff --git a/src/export_all_funcs.cpp b/src/export_all_funcs.cpp
index d6c076f..9790010 100644
--- a/src/export_all_funcs.cpp
+++ b/src/export_all_funcs.cpp
@@ -1,7 +1,6 @@
# include
// [[Rcpp::depends("RcppArmadillo")]]
// [[Rcpp::plugins(openmp)]]
-// [[Rcpp::plugins(cpp11)]]
// [[Rcpp::depends(BH)]]
diff --git a/src/init.c b/src/init.c
index 5a45cb8..53cae54 100644
--- a/src/init.c
+++ b/src/init.c
@@ -8,69 +8,69 @@
*/
/* .Call calls */
-extern SEXP _textTinyR_Adj_Sparsity(SEXP, SEXP, SEXP, SEXP, SEXP);
-extern SEXP _textTinyR_append_data(SEXP, SEXP);
-extern SEXP _textTinyR_Associations_Cpp(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
-extern SEXP _textTinyR_batch_2file(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
-extern SEXP _textTinyR_batch_calculation(SEXP, SEXP);
-extern SEXP _textTinyR_big_parser(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
-extern SEXP _textTinyR_big_splitter_bytes(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
-extern SEXP _textTinyR_big_tokenize(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
-extern SEXP _textTinyR_Collocations_ngrams(SEXP, SEXP, SEXP, SEXP, SEXP);
-extern SEXP _textTinyR_convert_bytes(SEXP, SEXP);
-extern SEXP _textTinyR_COR_MATR(SEXP, SEXP, SEXP);
-extern SEXP _textTinyR_COS(SEXP, SEXP, SEXP, SEXP);
-extern SEXP _textTinyR_cosine_dist(SEXP, SEXP, SEXP);
-extern SEXP _textTinyR_Cosine_dist(SEXP, SEXP, SEXP);
-extern SEXP _textTinyR_Count_characters(SEXP, SEXP, SEXP, SEXP);
-extern SEXP _textTinyR_count_rows(SEXP, SEXP);
-extern SEXP _textTinyR_COUNTS_INTERSECT(SEXP, SEXP, SEXP, SEXP);
-extern SEXP _textTinyR_dense_2sparse_mat(SEXP);
-extern SEXP _textTinyR_DICE(SEXP, SEXP);
-extern SEXP _textTinyR_Dice_similarity(SEXP, SEXP, SEXP);
-extern SEXP _textTinyR_Dissimilarity_mat(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
-extern SEXP _textTinyR_DIST(SEXP, SEXP, SEXP, SEXP, SEXP);
-extern SEXP _textTinyR_DISTINCT_WORD_INTERSECT(SEXP, SEXP);
-extern SEXP _textTinyR_file_parser(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
-extern SEXP _textTinyR_Frequency_distribution(SEXP, SEXP, SEXP, SEXP);
-extern SEXP _textTinyR_idf_global_term_weights(SEXP, SEXP);
-extern SEXP _textTinyR_inner_cm(SEXP, SEXP, SEXP);
-extern SEXP _textTinyR_inner_jd(SEXP, SEXP, SEXP, SEXP);
-extern SEXP _textTinyR_inner_reduce_dims(SEXP, SEXP, SEXP, SEXP);
-extern SEXP _textTinyR_INTERSECT(SEXP, SEXP);
-extern SEXP _textTinyR_JACCARD(SEXP, SEXP);
-extern SEXP _textTinyR_jaccard_dice(SEXP, SEXP, SEXP, SEXP);
-extern SEXP _textTinyR_keep_idxs(SEXP, SEXP);
-extern SEXP _textTinyR_Levenshtein_dist(SEXP, SEXP);
-extern SEXP _textTinyR_load_sparse_(SEXP);
-extern SEXP _textTinyR_Look_up_tbl(SEXP, SEXP);
-extern SEXP _textTinyR_modulus(SEXP, SEXP);
-extern SEXP _textTinyR_Most_Freq_Terms(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
-extern SEXP _textTinyR_Not_Duplicated(SEXP);
-extern SEXP _textTinyR_NUM_LETTERS_DISTINCT(SEXP);
-extern SEXP _textTinyR_Path_2vector(SEXP, SEXP);
-extern SEXP _textTinyR_RATIO_DISTINCT(SEXP, SEXP, SEXP, SEXP);
-extern SEXP _textTinyR_read_CHARS(SEXP, SEXP, SEXP);
-extern SEXP _textTinyR_read_ROWS(SEXP, SEXP, SEXP, SEXP);
-extern SEXP _textTinyR_read_ROWS_wv(SEXP, SEXP);
-extern SEXP _textTinyR_reduce_dims_with_correlation(SEXP, SEXP, SEXP, SEXP, SEXP);
-extern SEXP _textTinyR_reduced_word_vectors(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
-extern SEXP _textTinyR_res_term_matrix(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
-extern SEXP _textTinyR_res_token(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
-extern SEXP _textTinyR_res_token_list(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
-extern SEXP _textTinyR_res_token_vector(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
-extern SEXP _textTinyR_save_sparse_(SEXP, SEXP);
-extern SEXP _textTinyR_sparsity_float(SEXP);
-extern SEXP _textTinyR_sp_means(SEXP, SEXP);
-extern SEXP _textTinyR_sp_sums(SEXP, SEXP);
-extern SEXP _textTinyR_sublist(SEXP, SEXP);
-extern SEXP _textTinyR_tf_idf_exclude(SEXP, SEXP);
-extern SEXP _textTinyR_UNION(SEXP, SEXP);
-extern SEXP _textTinyR_UNIQUE(SEXP);
-extern SEXP _textTinyR_vec_parser(SEXP, SEXP, SEXP, SEXP, SEXP);
-extern SEXP _textTinyR_vocabulary_counts(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
-extern SEXP _textTinyR_vocabulary_counts_big_tokenize(SEXP, SEXP, SEXP, SEXP);
-extern SEXP _textTinyR_word_vectors_methods(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
+extern SEXP _textTinyR_Adj_Sparsity(void *, void *, void *, void *, void *);
+extern SEXP _textTinyR_append_data(void *, void *);
+extern SEXP _textTinyR_Associations_Cpp(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *);
+extern SEXP _textTinyR_batch_2file(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *);
+extern SEXP _textTinyR_batch_calculation(void *, void *);
+extern SEXP _textTinyR_big_parser(void *, void *, void *, void *, void *, void *, void *);
+extern SEXP _textTinyR_big_splitter_bytes(void *, void *, void *, void *, void *, void *);
+extern SEXP _textTinyR_big_tokenize(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *);
+extern SEXP _textTinyR_Collocations_ngrams(void *, void *, void *, void *, void *);
+extern SEXP _textTinyR_convert_bytes(void *, void *);
+extern SEXP _textTinyR_COR_MATR(void *, void *, void *);
+extern SEXP _textTinyR_COS(void *, void *, void *, void *);
+extern SEXP _textTinyR_cosine_dist(void *, void *, void *);
+extern SEXP _textTinyR_Cosine_dist(void *, void *, void *);
+extern SEXP _textTinyR_Count_characters(void *, void *, void *, void *);
+extern SEXP _textTinyR_count_rows(void *, void *);
+extern SEXP _textTinyR_COUNTS_INTERSECT(void *, void *, void *, void *);
+extern SEXP _textTinyR_dense_2sparse_mat(void *);
+extern SEXP _textTinyR_DICE(void *, void *);
+extern SEXP _textTinyR_Dice_similarity(void *, void *, void *);
+extern SEXP _textTinyR_Dissimilarity_mat(void *, void *, void *, void *, void *, void *, void *, void *);
+extern SEXP _textTinyR_DIST(void *, void *, void *, void *, void *);
+extern SEXP _textTinyR_DISTINCT_WORD_INTERSECT(void *, void *);
+extern SEXP _textTinyR_file_parser(void *, void *, void *, void *, void *, void *, void *);
+extern SEXP _textTinyR_Frequency_distribution(void *, void *, void *, void *);
+extern SEXP _textTinyR_idf_global_term_weights(void *, void *);
+extern SEXP _textTinyR_inner_cm(void *, void *, void *);
+extern SEXP _textTinyR_inner_jd(void *, void *, void *, void *);
+extern SEXP _textTinyR_inner_reduce_dims(void *, void *, void *, void *);
+extern SEXP _textTinyR_INTERSECT(void *, void *);
+extern SEXP _textTinyR_JACCARD(void *, void *);
+extern SEXP _textTinyR_jaccard_dice(void *, void *, void *, void *);
+extern SEXP _textTinyR_keep_idxs(void *, void *);
+extern SEXP _textTinyR_Levenshtein_dist(void *, void *);
+extern SEXP _textTinyR_load_sparse_(void *);
+extern SEXP _textTinyR_Look_up_tbl(void *, void *);
+extern SEXP _textTinyR_modulus(void *, void *);
+extern SEXP _textTinyR_Most_Freq_Terms(void *, void *, void *, void *, void *, void *);
+extern SEXP _textTinyR_Not_Duplicated(void *);
+extern SEXP _textTinyR_NUM_LETTERS_DISTINCT(void *);
+extern SEXP _textTinyR_Path_2vector(void *, void *);
+extern SEXP _textTinyR_RATIO_DISTINCT(void *, void *, void *, void *);
+extern SEXP _textTinyR_read_CHARS(void *, void *, void *);
+extern SEXP _textTinyR_read_ROWS(void *, void *, void *, void *);
+extern SEXP _textTinyR_read_ROWS_wv(void *, void *);
+extern SEXP _textTinyR_reduce_dims_with_correlation(void *, void *, void *, void *, void *);
+extern SEXP _textTinyR_reduced_word_vectors(void *, void *, void *, void *, void *, void *);
+extern SEXP _textTinyR_res_term_matrix(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *);
+extern SEXP _textTinyR_res_token(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *);
+extern SEXP _textTinyR_res_token_list(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *);
+extern SEXP _textTinyR_res_token_vector(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *);
+extern SEXP _textTinyR_save_sparse_(void *, void *);
+extern SEXP _textTinyR_sp_means(void *, void *);
+extern SEXP _textTinyR_sp_sums(void *, void *);
+extern SEXP _textTinyR_sparsity_float(void *);
+extern SEXP _textTinyR_sublist(void *, void *);
+extern SEXP _textTinyR_tf_idf_exclude(void *, void *);
+extern SEXP _textTinyR_UNION(void *, void *);
+extern SEXP _textTinyR_UNIQUE(void *);
+extern SEXP _textTinyR_vec_parser(void *, void *, void *, void *, void *);
+extern SEXP _textTinyR_vocabulary_counts(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *);
+extern SEXP _textTinyR_vocabulary_counts_big_tokenize(void *, void *, void *, void *);
+extern SEXP _textTinyR_word_vectors_methods(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *);
static const R_CallMethodDef CallEntries[] = {
{"_textTinyR_Adj_Sparsity", (DL_FUNC) &_textTinyR_Adj_Sparsity, 5},
@@ -125,9 +125,9 @@ static const R_CallMethodDef CallEntries[] = {
{"_textTinyR_res_token_list", (DL_FUNC) &_textTinyR_res_token_list, 31},
{"_textTinyR_res_token_vector", (DL_FUNC) &_textTinyR_res_token_vector, 31},
{"_textTinyR_save_sparse_", (DL_FUNC) &_textTinyR_save_sparse_, 2},
- {"_textTinyR_sparsity_float", (DL_FUNC) &_textTinyR_sparsity_float, 1},
{"_textTinyR_sp_means", (DL_FUNC) &_textTinyR_sp_means, 2},
{"_textTinyR_sp_sums", (DL_FUNC) &_textTinyR_sp_sums, 2},
+ {"_textTinyR_sparsity_float", (DL_FUNC) &_textTinyR_sparsity_float, 1},
{"_textTinyR_sublist", (DL_FUNC) &_textTinyR_sublist, 2},
{"_textTinyR_tf_idf_exclude", (DL_FUNC) &_textTinyR_tf_idf_exclude, 2},
{"_textTinyR_UNION", (DL_FUNC) &_textTinyR_UNION, 2},
diff --git a/src/rcpp_similarities.cpp b/src/rcpp_similarities.cpp
index 7e5d3df..17f2da7 100644
--- a/src/rcpp_similarities.cpp
+++ b/src/rcpp_similarities.cpp
@@ -1,7 +1,6 @@
# include
// [[Rcpp::depends("RcppArmadillo")]]
// [[Rcpp::plugins(openmp)]]
-// [[Rcpp::plugins(cpp11)]]
// [[Rcpp::depends(BH)]]
#include
diff --git a/src/term_matrix.h b/src/term_matrix.h
index 075931a..fd19166 100644
--- a/src/term_matrix.h
+++ b/src/term_matrix.h
@@ -453,7 +453,7 @@ class term_matrix {
if (count + 1 == tmp_print_rows) {
- Rprintf("\rtotal.number.lines.processed.tokenization: %3d", count + 1);
+ Rprintf("\rtotal.number.lines.processed.tokenization: %3lld", count + 1);
tmp_print_rows += print_every_rows;
}
@@ -644,7 +644,7 @@ class term_matrix {
if (count + 1 == tmp_print_rows) {
- Rprintf("\rtotal.number.lines.processed.tokenization: %3d", count + 1);
+ Rprintf("\rtotal.number.lines.processed.tokenization: %3lld", count + 1);
tmp_print_rows += print_every_rows;
}
diff --git a/src/token_big_files.h b/src/token_big_files.h
index 7b2c84a..185af99 100644
--- a/src/token_big_files.h
+++ b/src/token_big_files.h
@@ -859,7 +859,7 @@ class big_files {
if (verbose_print <= tmp_mem || flag_peek) {
- Rprintf("\rtotal.number.lines.processed: %3d", Lines);
+ Rprintf("\rtotal.number.lines.processed: %3lld", Lines);
Rprintf("\tdata.processed.approx.: %.1f %%", tmp_mem);
diff --git a/src/word_vecs_pointer_embedding.cpp b/src/word_vecs_pointer_embedding.cpp
index 497af09..f201150 100644
--- a/src/word_vecs_pointer_embedding.cpp
+++ b/src/word_vecs_pointer_embedding.cpp
@@ -1,7 +1,6 @@
# include
// [[Rcpp::depends("RcppArmadillo")]]
// [[Rcpp::plugins(openmp)]]
-// [[Rcpp::plugins(cpp11)]]
// [[Rcpp::depends(BH)]]
@@ -237,7 +236,7 @@ class PREPROCESS_WORD_VECS {
if (nr_rows + 1 == tmp_print_rows) {
- Rprintf("\rtotal.number.lines.processed.input: %3d", nr_rows + 1);
+ Rprintf("\rtotal.number.lines.processed.input: %3lld", nr_rows + 1);
tmp_print_rows += print_every_rows;
}
@@ -330,7 +329,7 @@ class PREPROCESS_WORD_VECS {
if (sec_nr_rows + 1 == sec_tmp_print_rows) {
- Rprintf("\rtotal.number.lines.processed.output: %3d", sec_nr_rows + 1);
+ Rprintf("\rtotal.number.lines.processed.output: %3lld", sec_nr_rows + 1);
sec_tmp_print_rows += print_every_rows;
}