diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml
new file mode 100644
index 0000000..8e6efee
--- /dev/null
+++ b/.github/workflows/docker-image.yml
@@ -0,0 +1,51 @@
+
+#..........................................................................................................
+# build, push and cache the docker image. I have to adjust the following in case of a different repository:
+# - I have to add the 'BUILD_DATE' arg in the Dockerfile
+# - I have to create a DOCKER_PASSWORD (use the docker token) in the 'Settings' tab of the repository
+# References:
+# - https://github.com/mlampros/IceSat2R/blob/master/.github/workflows/docker_image.yml
+# - https://github.com/orgs/community/discussions/25768#discussioncomment-3249184
+#..........................................................................................................
+
+on:
+  push:
+    branches: [main, master]
+  pull_request:
+    branches: [main, master]
+
+name: docker_img
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+      - id: string
+        uses: ASzc/change-string-case-action@v1
+        with:
+          string: ${{ github.event.repository.name }}
+
+      - name: Check Out Repo
+        uses: actions/checkout@v2
+
+      - name: Login to Docker Hub
+        uses: docker/login-action@v2
+        with:
+          username: ${{ github.repository_owner }}
+          password: ${{ secrets.DOCKER_PASSWORD }}
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v1
+
+      - name: Build and push
+        uses: docker/build-push-action@v2
+        with:
+          context: ./
+          build-args: BUILD_DATE="$(date -u +'%Y-%m-%dT%H:%M:%SZ')"
+          file: ./Dockerfile
+          builder: ${{ steps.buildx.outputs.name }}
+          push: true
+          tags: ${{ github.repository_owner }}/${{ steps.string.outputs.lowercase }}:rstudiodev
+          cache-from: type=registry,ref=${{ github.repository_owner }}/${{ steps.string.outputs.lowercase }}:buildcache
+          cache-to: type=registry,ref=${{ github.repository_owner }}/${{ steps.string.outputs.lowercase }}:buildcache,mode=max
diff --git a/.github/workflows/issue.yml b/.github/workflows/issue.yml
index 179031f..ebb037b 100644
--- a/.github/workflows/issue.yml
+++ b/.github/workflows/issue.yml
@@ -13,6 +13,6 @@ jobs:
       with:
         repo-token: ${{ secrets.GITHUB_TOKEN }}
         ignore-comments: true
-        labels-synonyms: '{"bug":["error","need fix","not working"],"enhancement":["upgrade"],"question":["help"]}'
-        labels-not-allowed: '["good first issue"]'
-        default-labels: '["help wanted"]'
+        labels-synonyms: '{"bug":["error","need fix","not working"],"enhancement":["upgrade"],"question":["help","how can i"]}'
+        labels-not-allowed: '["documentation","duplicate","good first issue","help wanted","invalid"]'
+        default-labels: '["triage"]'
diff --git a/DESCRIPTION b/DESCRIPTION
index 4401deb..d8a1abc 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: textTinyR
 Type: Package
 Title: Text Processing for Small or Big Data Files
-Version: 1.1.7
-Date: 2021-10-29
+Version: 1.1.8
+Date: 2023-12-04
 Authors@R: c( person(given = "Lampros", family = "Mouselimis", email = "mouselimislampros@gmail.com", role = c("aut", "cre"), comment = c(ORCID = "https://orcid.org/0000-0002-8024-1546")))
 BugReports: https://github.com/mlampros/textTinyR/issues
 URL: https://github.com/mlampros/textTinyR
@@ -16,6 +16,6 @@ Imports: Rcpp (>= 0.12.10), R6, data.table, utils
 LinkingTo: Rcpp, RcppArmadillo (>= 0.7.8), BH
 Suggests: testthat, covr, knitr, rmarkdown
 VignetteBuilder: knitr
-RoxygenNote: 7.1.2
+RoxygenNote: 7.2.3
 NeedsCompilation: yes
 Packaged: 2017-04-01 13:56:22 UTC; lampros
diff --git a/Dockerfile
b/Dockerfile index 961db92..6833e36 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,19 +1,19 @@ FROM rocker/rstudio:devel - LABEL maintainer='Lampros Mouselimis' - RUN export DEBIAN_FRONTEND=noninteractive; apt-get -y update && \ apt-get install -y zlib1g-dev git-core pandoc pandoc-citeproc libcurl4-openssl-dev libssl-dev && \ apt-get install -y sudo && \ apt-get install -y libarmadillo-dev && \ - R -e "install.packages(c( 'Rcpp', 'R6', 'data.table', 'utils', 'RcppArmadillo', 'BH', 'testthat', 'covr', 'knitr', 'rmarkdown', 'remotes' ), repos = 'https://cloud.r-project.org/' )" && \ - R -e "remotes::install_github('mlampros/textTinyR', upgrade = 'always', dependencies = TRUE, repos = 'https://cloud.r-project.org/')" && \ + R -e "install.packages(c( 'Rcpp', 'R6', 'data.table', 'utils', 'RcppArmadillo', 'BH', 'testthat', 'covr', 'knitr', 'rmarkdown', 'remotes' ), repos = 'https://cloud.r-project.org/' )" + +ADD http://www.random.org/strings/?num=10&len=8&digits=on&upperalpha=on&loweralpha=on&unique=on&format=plain&rnd=new uuid +ARG BUILD_DATE + +RUN echo "$BUILD_DATE" +RUN R -e "remotes::install_github('mlampros/textTinyR', upgrade = 'always', dependencies = TRUE, repos = 'https://cloud.r-project.org/')" && \ apt-get autoremove -y && \ apt-get clean - ENV USER rstudio - - diff --git a/NAMESPACE b/NAMESPACE index c2ff613..153d24a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -5,6 +5,7 @@ export(Count_Rows) export(Doc2Vec) export(JACCARD_DICE) export(TEXT_DOC_DISSIM) +export(batch_compute) export(big_tokenize_transform) export(bytes_converter) export(cluster_frequency) diff --git a/NEWS.md b/NEWS.md index 19f550f..a60457d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,14 @@ +## textTinyR 1.1.8 + +* I've fixed the CRAN *warning: format specifies type 'int' but the argument has type 'long long'* in the following files & lines by replacing the `%3d` expression with `%3lld`: + * ./token_big_files.h:862:60 + * ./term_matrix.h:456:75 *and* 647:75 + * word_vecs_pointer_embedding.cpp:333:67 *and* 240:68 +* I removed the "CXX_STD = CXX11" from the "Makevars" files, and the "[[Rcpp::plugins(cpp11)]]" from the ".cpp" files due to the following NOTE from CRAN, "NOTE Specified C++11: please drop specification unless essential" (see also: https://www.tidyverse.org/blog/2023/03/cran-checks-compiled-code/#note-regarding-systemrequirements-c11) +* I exported the *batch_calculation()* Rcpp function and created the *batch_compute()* R function +* I removed the `-mthreads` compilation option from the "Makevars.win" file + ## textTinyR 1.1.7 diff --git a/R/utils.R b/R/utils.R index 9ab3b98..c719153 100644 --- a/R/utils.R +++ b/R/utils.R @@ -645,36 +645,31 @@ utf_locale = function(language = "english") { #' #' @usage # utl <- big_tokenize_transform$new(verbose = FALSE) #' @examples +#' +#' \dontrun{ #' #' library(textTinyR) #' -#' -#' # fs <- big_tokenize_transform$new(verbose = FALSE) +#' fs <- big_tokenize_transform$new(verbose = FALSE) #' #' #--------------- #' # file splitter: #' #--------------- #' -#' # fs$big_text_splitter(input_path_file = "input.txt", -#' -#' # output_path_folder = "/folder/output/", -#' -#' # end_query = "endword", batches = 5, -#' -#' # trimmed_line = FALSE) +#' fs$big_text_splitter(input_path_file = "input.txt", +#' output_path_folder = "/folder/output/", +#' end_query = "endword", batches = 5, +#' trimmed_line = FALSE) #' #' #' #------------- #' # file parser: #' #------------- #' -#' # fs$big_text_parser(input_path_folder = "/folder/output/", -#' -#' # output_path_folder = "/folder/parser/", 
-#' -#' # start_query = "startword", end_query = "endword", -#' -#' # min_lines = 1, trimmed_line = TRUE) +#' fs$big_text_parser(input_path_folder = "/folder/output/", +#' output_path_folder = "/folder/parser/", +#' start_query = "startword", end_query = "endword", +#' min_lines = 1, trimmed_line = TRUE) #' #' #' #---------------- @@ -682,32 +677,24 @@ utf_locale = function(language = "english") { #' #---------------- #' #' -#' # fs$big_text_tokenizer(input_path_folder = "/folder/parser/", -#' -#' # batches = 5, split_string=TRUE, -#' -#' # to_lower = TRUE, trim_token = TRUE, -#' -#' # max_num_char = 100, remove_stopwords = TRUE, -#' -#' # stemmer = "porter2_stemmer", threads = 1, -#' -#' # path_2folder="/folder/output_token/", -#' -#' # vocabulary_path_folder="/folder/VOCAB/") +#' fs$big_text_tokenizer(input_path_folder = "/folder/parser/", +#' batches = 5, split_string=TRUE, +#' to_lower = TRUE, trim_token = TRUE, +#' max_num_char = 100, remove_stopwords = TRUE, +#' stemmer = "porter2_stemmer", threads = 1, +#' path_2folder="/folder/output_token/", +#' vocabulary_path_folder="/folder/VOCAB/") #' #' #------------------- #' # vocabulary counts: #' #------------------- #' #' -#' # fs$vocabulary_accumulator(input_path_folder = "/folder/VOCAB/", -#' -#' # vocabulary_path_file = "/folder/vocab.txt", -#' -#' # max_num_chars = 50) - - +#' fs$vocabulary_accumulator(input_path_folder = "/folder/VOCAB/", +#' vocabulary_path_file = "/folder/vocab.txt", +#' max_num_chars = 50) +#' +#' } big_tokenize_transform <- R6::R6Class("big_tokenize_transform", @@ -1006,16 +993,16 @@ big_tokenize_transform <- R6::R6Class("big_tokenize_transform", #' Stemming of the english language is done using the porter2-stemmer, for details see \url{https://github.com/smassung/porter2_stemmer} #' #' @examples +#' +#' \dontrun{ #' #' library(textTinyR) #' -#' # vps = vocabulary_parser(input_path_file = '/folder/input_data.txt', -#' -#' # start_query = 'start_word', end_query = 'end_word', -#' -#' # vocabulary_path_file = '/folder/vocab.txt', -#' -#' # to_lower = TRUE, split_string = TRUE) +#' vps = vocabulary_parser(input_path_file = '/folder/input_data.txt', +#' start_query = 'start_word', end_query = 'end_word', +#' vocabulary_path_file = '/folder/vocab.txt', +#' to_lower = TRUE, split_string = TRUE) +#' } vocabulary_parser = function(input_path_file = NULL, start_query = NULL, end_query = NULL, vocabulary_path_file = NULL, min_lines = 1, trimmed_line = FALSE, to_lower = FALSE, @@ -1146,10 +1133,14 @@ vocabulary_parser = function(input_path_file = NULL, start_query = NULL, end_que #' @return a number #' @export #' @examples +#' +#' \dontrun{ #' #' library(textTinyR) #' -#' # bc = bytes_converter(input_path_file = 'some_file.txt', unit = "MB") +#' bc = bytes_converter(input_path_file = 'some_file.txt', unit = "MB") +#' +#' } bytes_converter = function(input_path_file = NULL, unit = "MB") { @@ -1178,41 +1169,36 @@ bytes_converter = function(input_path_file = NULL, unit = "MB") { #' The text file should have a structure (such as an xml-structure), so that subsets can be extracted using the \emph{start_query} and \emph{end_query} parameters ( the same applies in case of a vector of character strings) #' @export #' @examples +#' +#' \dontrun{ #' #' library(textTinyR) #' #' # In case that the 'input_path_file' is a valid path #' #--------------------------------------------------- -#' -#' # fp = text_file_parser(input_path_file = '/folder/input_data.txt', -#' -#' # output_path_file = '/folder/output_data.txt', -#' -#' # 
start_query = 'word_a', end_query = 'word_w', -#' -#' # min_lines = 1, trimmed_line = FALSE) -#' -#' +#' +#' fp = text_file_parser(input_path_file = '/folder/input_data.txt', +#' output_path_file = '/folder/output_data.txt', +#' start_query = 'word_a', end_query = 'word_w', +#' min_lines = 1, trimmed_line = FALSE) +#' +#' #' # In case that the 'input_path_file' is a character vector of strings #' #-------------------------------------------------------------------- -#' -#' # PATH_url = "https://FILE.xml" -#' -#' # con = url(PATH_url, method = "libcurl") -#' -#' # tmp_dat = read.delim(con, quote = "\"", comment.char = "", stringsAsFactors = FALSE) -#' -#' # vec_docs = unlist(lapply(1:length(as.vector(tmp_dat[, 1])), function(x) -#' -#' # trimws(tmp_dat[x, 1], which = "both"))) -#' -#' # parse_data = text_file_parser(input_path_file = vec_docs, -#' -#' # start_query = c("", "", ""), -#' -#' # end_query = c("", "", ""), -#' -#' # min_lines = 1, trimmed_line = TRUE) +#' +#' PATH_url = "https://FILE.xml" +#' con = url(PATH_url, method = "libcurl") +#' tmp_dat = read.delim(con, quote = "\"", comment.char = "", stringsAsFactors = FALSE) +#' +#' vec_docs = unlist(lapply(1:length(as.vector(tmp_dat[, 1])), function(x) +#' trimws(tmp_dat[x, 1], which = "both"))) +#' +#' parse_data = text_file_parser(input_path_file = vec_docs, +#' start_query = c("", "", ""), +#' end_query = c("", "", ""), +#' min_lines = 1, trimmed_line = TRUE) +#' +#' } text_file_parser = function(input_path_file = NULL, output_path_file = "", start_query = NULL, end_query = NULL, min_lines = 1, trimmed_line = FALSE, verbose = FALSE) { @@ -1403,7 +1389,6 @@ text_file_parser = function(input_path_file = NULL, output_path_file = "", start #' # tk$print_words_lookup_tbl(n_gram = 'e_w') - token_stats <- R6::R6Class("token_stats", public = list( @@ -1726,9 +1711,6 @@ cosine_distance = function(sentence1, sentence2, split_separator = " ") { } - - - #' Term matrices and statistics ( document-term-matrix, term-document-matrix) #' #' @@ -1825,62 +1807,59 @@ cosine_distance = function(sentence1, sentence2, split_separator = " ") { #' #' # document_term_matrix = TRUE) #' @examples +#' +#' \dontrun{ #' #' library(textTinyR) #' #' -#' # sm <- sparse_term_matrix$new(file_data = "/folder/my_data.txt", -#' -#' # document_term_matrix = TRUE) -#' +#' sm <- sparse_term_matrix$new(file_data = "/folder/my_data.txt", +#' document_term_matrix = TRUE) +#' #' #-------------- #' # term matrix : #' #-------------- -#' -#' # sm$Term_Matrix(sort_terms = TRUE, to_lower = TRUE, -#' -#' # trim_token = TRUE, split_string = TRUE, -#' -#' # remove_stopwords = TRUE, normalize = 'l1', -#' -#' # stemmer = 'porter2_stemmer', threads = 1 ) -#' +#' +#' sm$Term_Matrix(sort_terms = TRUE, to_lower = TRUE, +#' trim_token = TRUE, split_string = TRUE, +#' remove_stopwords = TRUE, normalize = 'l1', +#' stemmer = 'porter2_stemmer', threads = 1 ) +#' #' #--------------- #' # triplet data : #' #--------------- -#' -#' # sm$triplet_data() -#' -#' +#' +#' sm$triplet_data() +#' +#' #' #---------------------- #' # global-term-weights : #' #---------------------- -#' -#' # sm$global_term_weights() -#' -#' +#' +#' sm$global_term_weights() +#' +#' #' #------------------------- #' # removal of sparse terms: #' #------------------------- -#' -#' # sm$Term_Matrix_Adjust(sparsity_thresh = 0.995) -#' -#' +#' +#' sm$Term_Matrix_Adjust(sparsity_thresh = 0.995) +#' +#' #' #----------------------------------------------- #' # associations between terms of a sparse matrix: #' 
#----------------------------------------------- -#' -#' -#' # sm$term_associations(Terms = c("word", "sentence"), keep_terms = 10) -#' -#' +#' +#' sm$term_associations(Terms = c("word", "sentence"), keep_terms = 10) +#' +#' #' #--------------------------------------------- #' # most frequent terms using the sparse matrix: #' #--------------------------------------------- -#' -#' -#' # sm$most_frequent_terms(keep_terms = 10, threads = 1) - +#' +#' sm$most_frequent_terms(keep_terms = 10, threads = 1) +#' +#' } sparse_term_matrix <- R6::R6Class("sparse_term_matrix", @@ -2509,8 +2488,6 @@ sparse_Means = function(sparse_matrix, rowMeans = FALSE) { } - - #' sparsity percentage of a sparse matrix #' #' @@ -2536,7 +2513,6 @@ matrix_sparsity = function(sparse_matrix) { } - #' save a sparse matrix in binary format #' #' @@ -2567,7 +2543,6 @@ save_sparse_binary = function(sparse_matrix, file_name = "save_sparse.mat") { } - #' load a sparse matrix in binary format #' #' @@ -2576,9 +2551,13 @@ save_sparse_binary = function(sparse_matrix, file_name = "save_sparse.mat") { #' @export #' @examples #' +#' \dontrun{ +#' #' library(textTinyR) #' -#' # load_sparse_binary(file_name = "save_sparse.mat") +#' load_sparse_binary(file_name = "save_sparse.mat") +#' +#' } load_sparse_binary = function(file_name = "save_sparse.mat") { @@ -2591,8 +2570,6 @@ load_sparse_binary = function(file_name = "save_sparse.mat") { } - - #' read a specific number of characters from a text file #' #' @@ -2601,10 +2578,14 @@ load_sparse_binary = function(file_name = "save_sparse.mat") { #' @param write_2file either an empty string ("") or a character string specifying a valid output file to write the subset of the input file #' @export #' @examples +#' +#' \dontrun{ #' #' library(textTinyR) #' -#' # txfl = read_characters(input_file = 'input.txt', characters = 100) +#' txfl = read_characters(input_file = 'input.txt', characters = 100) +#' +#' } read_characters = function(input_file = NULL, characters = 100, write_2file = "") { @@ -2638,10 +2619,14 @@ read_characters = function(input_file = NULL, characters = 100, write_2file = "" #' @param write_2file either "" or a character string specifying a valid output file to write the subset of the input file #' @export #' @examples +#' +#' \dontrun{ #' #' library(textTinyR) #' -#' # txfl = read_rows(input_file = 'input.txt', rows = 100) +#' txfl = read_rows(input_file = 'input.txt', rows = 100) +#' +#' } read_rows = function(input_file = NULL, read_delimiter = "\n", rows = 100, write_2file = "") { @@ -2665,12 +2650,8 @@ read_rows = function(input_file = NULL, read_delimiter = "\n", rows = 100, write } - #------------------------------------------------------------------------------------------------------------------------------------------------------------ Word-Vector-Utility functions - - - #' dimensions of a word vectors file #' #' @param input_file a character string specifying a valid path to a text file @@ -3325,3 +3306,22 @@ cluster_frequency = function(tokenized_list_text, cluster_vector, verbose = FALS } +#' Compute batches +#' +#' @param n_rows a numeric specifying the number of rows +#' @param n_batches a numeric specifying the number of output batches +#' @return a list +#' @export +#' @examples +#' +#' library(textTinyR) +#' +#' btch = batch_compute(n_rows = 1000, n_batches = 10) + + +batch_compute = function(n_rows, n_batches) { + if (!inherits(n_rows, c('numeric', 'integer'))) stop("the 'n_rows' parameter should be of type either numeric or integer", call. 
= F) + if (!inherits(n_batches, c('numeric', 'integer'))) stop("the 'n_batches' parameter should be of type either numeric or integer", call. = F) + + return(batch_calculation(n_rows, n_batches)) +} diff --git a/README.md b/README.md index 5e7816a..efcaa8b 100644 --- a/README.md +++ b/README.md @@ -99,7 +99,7 @@ chmod -R 777 /home/YOUR_DIR
-The **USER** defaults to *rstudio* but you have to give your **PASSWORD** of preference (see [www.rocker-project.org](https://www.rocker-project.org/) for more information).
+The **USER** defaults to *rstudio*, but you have to provide a **PASSWORD** of your preference (see [https://rocker-project.org/](https://rocker-project.org/) for more information).
@@ -144,7 +144,7 @@ If you use the code of this repository in your paper or research please cite bot title = {{textTinyR}: Text Processing for Small or Big Data Files}, author = {Lampros Mouselimis}, year = {2021}, - note = {R package version 1.1.7}, + note = {R package version 1.1.8}, url = {https://CRAN.R-project.org/package=textTinyR}, } ``` diff --git a/man/Doc2Vec.Rd b/man/Doc2Vec.Rd index 8904a4c..db48ec5 100644 --- a/man/Doc2Vec.Rd +++ b/man/Doc2Vec.Rd @@ -76,15 +76,15 @@ out = init$doc2vec_methods(method = "sum_sqrt") \section{Methods}{ \subsection{Public methods}{ \itemize{ -\item \href{#method-new}{\code{Doc2Vec$new()}} -\item \href{#method-doc2vec_methods}{\code{Doc2Vec$doc2vec_methods()}} -\item \href{#method-pre_processed_wv}{\code{Doc2Vec$pre_processed_wv()}} -\item \href{#method-clone}{\code{Doc2Vec$clone()}} +\item \href{#method-documents_to_wordvectors-new}{\code{Doc2Vec$new()}} +\item \href{#method-documents_to_wordvectors-doc2vec_methods}{\code{Doc2Vec$doc2vec_methods()}} +\item \href{#method-documents_to_wordvectors-pre_processed_wv}{\code{Doc2Vec$pre_processed_wv()}} +\item \href{#method-documents_to_wordvectors-clone}{\code{Doc2Vec$clone()}} } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-new}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-documents_to_wordvectors-new}{}}} \subsection{Method \code{new()}}{ \subsection{Usage}{ \if{html}{\out{
}}\preformatted{Doc2Vec$new( @@ -113,8 +113,8 @@ out = init$doc2vec_methods(method = "sum_sqrt") } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-doc2vec_methods}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-documents_to_wordvectors-doc2vec_methods}{}}} \subsection{Method \code{doc2vec_methods()}}{ \subsection{Usage}{ \if{html}{\out{
}}\preformatted{Doc2Vec$doc2vec_methods( @@ -137,8 +137,8 @@ out = init$doc2vec_methods(method = "sum_sqrt") } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-pre_processed_wv}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-documents_to_wordvectors-pre_processed_wv}{}}} \subsection{Method \code{pre_processed_wv()}}{ \subsection{Usage}{ \if{html}{\out{
}}\preformatted{Doc2Vec$pre_processed_wv()}\if{html}{\out{
}} @@ -146,8 +146,8 @@ out = init$doc2vec_methods(method = "sum_sqrt") } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-clone}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-documents_to_wordvectors-clone}{}}} \subsection{Method \code{clone()}}{ The objects of this class are cloneable with this method. \subsection{Usage}{ diff --git a/man/batch_compute.Rd b/man/batch_compute.Rd new file mode 100644 index 0000000..bd5e72a --- /dev/null +++ b/man/batch_compute.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{batch_compute} +\alias{batch_compute} +\title{Compute batches} +\usage{ +batch_compute(n_rows, n_batches) +} +\arguments{ +\item{n_rows}{a numeric specifying the number of rows} + +\item{n_batches}{a numeric specifying the number of output batches} +} +\value{ +a list +} +\description{ +Compute batches +} +\examples{ + +library(textTinyR) + +btch = batch_compute(n_rows = 1000, n_batches = 10) +} diff --git a/man/big_tokenize_transform.Rd b/man/big_tokenize_transform.Rd index fa374a3..fa5edf5 100644 --- a/man/big_tokenize_transform.Rd +++ b/man/big_tokenize_transform.Rd @@ -49,35 +49,30 @@ The \emph{ngram_sequential} or \emph{ngram_overlap} stemming method applies to e \examples{ -library(textTinyR) +\dontrun{ +library(textTinyR) -# fs <- big_tokenize_transform$new(verbose = FALSE) +fs <- big_tokenize_transform$new(verbose = FALSE) #--------------- # file splitter: #--------------- -# fs$big_text_splitter(input_path_file = "input.txt", - -# output_path_folder = "/folder/output/", - -# end_query = "endword", batches = 5, - -# trimmed_line = FALSE) +fs$big_text_splitter(input_path_file = "input.txt", + output_path_folder = "/folder/output/", + end_query = "endword", batches = 5, + trimmed_line = FALSE) #------------- # file parser: #------------- -# fs$big_text_parser(input_path_folder = "/folder/output/", - -# output_path_folder = "/folder/parser/", - -# start_query = "startword", end_query = "endword", - -# min_lines = 1, trimmed_line = TRUE) +fs$big_text_parser(input_path_folder = "/folder/output/", + output_path_folder = "/folder/parser/", + start_query = "startword", end_query = "endword", + min_lines = 1, trimmed_line = TRUE) #---------------- @@ -85,45 +80,39 @@ library(textTinyR) #---------------- -# fs$big_text_tokenizer(input_path_folder = "/folder/parser/", - -# batches = 5, split_string=TRUE, - -# to_lower = TRUE, trim_token = TRUE, - -# max_num_char = 100, remove_stopwords = TRUE, - -# stemmer = "porter2_stemmer", threads = 1, - -# path_2folder="/folder/output_token/", - -# vocabulary_path_folder="/folder/VOCAB/") + fs$big_text_tokenizer(input_path_folder = "/folder/parser/", + batches = 5, split_string=TRUE, + to_lower = TRUE, trim_token = TRUE, + max_num_char = 100, remove_stopwords = TRUE, + stemmer = "porter2_stemmer", threads = 1, + path_2folder="/folder/output_token/", + vocabulary_path_folder="/folder/VOCAB/") #------------------- # vocabulary counts: #------------------- -# fs$vocabulary_accumulator(input_path_folder = "/folder/VOCAB/", +fs$vocabulary_accumulator(input_path_folder = "/folder/VOCAB/", + vocabulary_path_file = "/folder/vocab.txt", + max_num_chars = 50) -# vocabulary_path_file = "/folder/vocab.txt", - -# max_num_chars = 50) +} } \section{Methods}{ \subsection{Public methods}{ \itemize{ -\item \href{#method-new}{\code{big_tokenize_transform$new()}} -\item \href{#method-big_text_splitter}{\code{big_tokenize_transform$big_text_splitter()}} -\item \href{#method-big_text_parser}{\code{big_tokenize_transform$big_text_parser()}} -\item 
\href{#method-big_text_tokenizer}{\code{big_tokenize_transform$big_text_tokenizer()}} -\item \href{#method-vocabulary_accumulator}{\code{big_tokenize_transform$vocabulary_accumulator()}} -\item \href{#method-clone}{\code{big_tokenize_transform$clone()}} +\item \href{#method-big_tokenize_transform-new}{\code{big_tokenize_transform$new()}} +\item \href{#method-big_tokenize_transform-big_text_splitter}{\code{big_tokenize_transform$big_text_splitter()}} +\item \href{#method-big_tokenize_transform-big_text_parser}{\code{big_tokenize_transform$big_text_parser()}} +\item \href{#method-big_tokenize_transform-big_text_tokenizer}{\code{big_tokenize_transform$big_text_tokenizer()}} +\item \href{#method-big_tokenize_transform-vocabulary_accumulator}{\code{big_tokenize_transform$vocabulary_accumulator()}} +\item \href{#method-big_tokenize_transform-clone}{\code{big_tokenize_transform$clone()}} } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-new}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-big_tokenize_transform-new}{}}} \subsection{Method \code{new()}}{ \subsection{Usage}{ \if{html}{\out{
}}\preformatted{big_tokenize_transform$new(verbose = FALSE)}\if{html}{\out{
}} @@ -138,8 +127,8 @@ library(textTinyR) } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-big_text_splitter}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-big_tokenize_transform-big_text_splitter}{}}} \subsection{Method \code{big_text_splitter()}}{ \subsection{Usage}{ \if{html}{\out{
}}\preformatted{big_tokenize_transform$big_text_splitter( @@ -168,8 +157,8 @@ library(textTinyR) } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-big_text_parser}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-big_tokenize_transform-big_text_parser}{}}} \subsection{Method \code{big_text_parser()}}{ \subsection{Usage}{ \if{html}{\out{
}}\preformatted{big_tokenize_transform$big_text_parser( @@ -201,8 +190,8 @@ library(textTinyR) } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-big_text_tokenizer}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-big_tokenize_transform-big_text_tokenizer}{}}} \subsection{Method \code{big_text_tokenizer()}}{ \subsection{Usage}{ \if{html}{\out{
}}\preformatted{big_tokenize_transform$big_text_tokenizer( @@ -321,8 +310,8 @@ are \emph{afrikaans}, \emph{arabic}, \emph{armenian}, \emph{basque}, \emph{benga } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-vocabulary_accumulator}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-big_tokenize_transform-vocabulary_accumulator}{}}} \subsection{Method \code{vocabulary_accumulator()}}{ \subsection{Usage}{ \if{html}{\out{
}}\preformatted{big_tokenize_transform$vocabulary_accumulator( @@ -345,8 +334,8 @@ are \emph{afrikaans}, \emph{arabic}, \emph{armenian}, \emph{basque}, \emph{benga } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-clone}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-big_tokenize_transform-clone}{}}} \subsection{Method \code{clone()}}{ The objects of this class are cloneable with this method. \subsection{Usage}{ diff --git a/man/bytes_converter.Rd b/man/bytes_converter.Rd index c315c03..310da9f 100644 --- a/man/bytes_converter.Rd +++ b/man/bytes_converter.Rd @@ -19,7 +19,11 @@ bytes converter of a text file ( KB, MB or GB ) } \examples{ +\dontrun{ + library(textTinyR) -# bc = bytes_converter(input_path_file = 'some_file.txt', unit = "MB") +bc = bytes_converter(input_path_file = 'some_file.txt', unit = "MB") + +} } diff --git a/man/load_sparse_binary.Rd b/man/load_sparse_binary.Rd index d50a70a..cb40884 100644 --- a/man/load_sparse_binary.Rd +++ b/man/load_sparse_binary.Rd @@ -17,7 +17,11 @@ load a sparse matrix in binary format } \examples{ +\dontrun{ + library(textTinyR) -# load_sparse_binary(file_name = "save_sparse.mat") +load_sparse_binary(file_name = "save_sparse.mat") + +} } diff --git a/man/read_characters.Rd b/man/read_characters.Rd index e14c773..4928381 100644 --- a/man/read_characters.Rd +++ b/man/read_characters.Rd @@ -18,7 +18,11 @@ read a specific number of characters from a text file } \examples{ +\dontrun{ + library(textTinyR) -# txfl = read_characters(input_file = 'input.txt', characters = 100) +txfl = read_characters(input_file = 'input.txt', characters = 100) + +} } diff --git a/man/read_rows.Rd b/man/read_rows.Rd index 65ad6af..8bed41e 100644 --- a/man/read_rows.Rd +++ b/man/read_rows.Rd @@ -25,7 +25,11 @@ read a specific number of rows from a text file } \examples{ +\dontrun{ + library(textTinyR) -# txfl = read_rows(input_file = 'input.txt', rows = 100) +txfl = read_rows(input_file = 'input.txt', rows = 100) + +} } diff --git a/man/sparse_term_matrix.Rd b/man/sparse_term_matrix.Rd index c9bb187..7e2cfcf 100644 --- a/man/sparse_term_matrix.Rd +++ b/man/sparse_term_matrix.Rd @@ -63,77 +63,75 @@ Stemming of the english language is done using the porter2-stemmer, for details \examples{ -library(textTinyR) +\dontrun{ +library(textTinyR) -# sm <- sparse_term_matrix$new(file_data = "/folder/my_data.txt", -# document_term_matrix = TRUE) +sm <- sparse_term_matrix$new(file_data = "/folder/my_data.txt", + document_term_matrix = TRUE) #-------------- # term matrix : #-------------- -# sm$Term_Matrix(sort_terms = TRUE, to_lower = TRUE, - -# trim_token = TRUE, split_string = TRUE, - -# remove_stopwords = TRUE, normalize = 'l1', - -# stemmer = 'porter2_stemmer', threads = 1 ) +sm$Term_Matrix(sort_terms = TRUE, to_lower = TRUE, + trim_token = TRUE, split_string = TRUE, + remove_stopwords = TRUE, normalize = 'l1', + stemmer = 'porter2_stemmer', threads = 1 ) #--------------- # triplet data : #--------------- -# sm$triplet_data() +sm$triplet_data() #---------------------- # global-term-weights : #---------------------- -# sm$global_term_weights() +sm$global_term_weights() #------------------------- # removal of sparse terms: #------------------------- -# sm$Term_Matrix_Adjust(sparsity_thresh = 0.995) +sm$Term_Matrix_Adjust(sparsity_thresh = 0.995) #----------------------------------------------- # associations between terms of a sparse matrix: #----------------------------------------------- - -# sm$term_associations(Terms = c("word", "sentence"), keep_terms = 10) +sm$term_associations(Terms = c("word", "sentence"), keep_terms = 10) #--------------------------------------------- # most frequent terms using the sparse matrix: 
#--------------------------------------------- +sm$most_frequent_terms(keep_terms = 10, threads = 1) -# sm$most_frequent_terms(keep_terms = 10, threads = 1) +} } \section{Methods}{ \subsection{Public methods}{ \itemize{ -\item \href{#method-new}{\code{sparse_term_matrix$new()}} -\item \href{#method-Term_Matrix}{\code{sparse_term_matrix$Term_Matrix()}} -\item \href{#method-triplet_data}{\code{sparse_term_matrix$triplet_data()}} -\item \href{#method-global_term_weights}{\code{sparse_term_matrix$global_term_weights()}} -\item \href{#method-Term_Matrix_Adjust}{\code{sparse_term_matrix$Term_Matrix_Adjust()}} -\item \href{#method-term_associations}{\code{sparse_term_matrix$term_associations()}} -\item \href{#method-most_frequent_terms}{\code{sparse_term_matrix$most_frequent_terms()}} -\item \href{#method-clone}{\code{sparse_term_matrix$clone()}} +\item \href{#method-sparse_term_matrix-new}{\code{sparse_term_matrix$new()}} +\item \href{#method-sparse_term_matrix-Term_Matrix}{\code{sparse_term_matrix$Term_Matrix()}} +\item \href{#method-sparse_term_matrix-triplet_data}{\code{sparse_term_matrix$triplet_data()}} +\item \href{#method-sparse_term_matrix-global_term_weights}{\code{sparse_term_matrix$global_term_weights()}} +\item \href{#method-sparse_term_matrix-Term_Matrix_Adjust}{\code{sparse_term_matrix$Term_Matrix_Adjust()}} +\item \href{#method-sparse_term_matrix-term_associations}{\code{sparse_term_matrix$term_associations()}} +\item \href{#method-sparse_term_matrix-most_frequent_terms}{\code{sparse_term_matrix$most_frequent_terms()}} +\item \href{#method-sparse_term_matrix-clone}{\code{sparse_term_matrix$clone()}} } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-new}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-sparse_term_matrix-new}{}}} \subsection{Method \code{new()}}{ \subsection{Usage}{ \if{html}{\out{
}}\preformatted{sparse_term_matrix$new( @@ -156,8 +154,8 @@ library(textTinyR) } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Term_Matrix}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-sparse_term_matrix-Term_Matrix}{}}} \subsection{Method \code{Term_Matrix()}}{ \subsection{Usage}{ \if{html}{\out{
}}\preformatted{sparse_term_matrix$Term_Matrix( @@ -255,8 +253,8 @@ are \emph{afrikaans}, \emph{arabic}, \emph{armenian}, \emph{basque}, \emph{benga } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-triplet_data}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-sparse_term_matrix-triplet_data}{}}} \subsection{Method \code{triplet_data()}}{ \subsection{Usage}{ \if{html}{\out{
}}\preformatted{sparse_term_matrix$triplet_data()}\if{html}{\out{
}} @@ -264,8 +262,8 @@ are \emph{afrikaans}, \emph{arabic}, \emph{armenian}, \emph{basque}, \emph{benga } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-global_term_weights}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-sparse_term_matrix-global_term_weights}{}}} \subsection{Method \code{global_term_weights()}}{ \subsection{Usage}{ \if{html}{\out{
}}\preformatted{sparse_term_matrix$global_term_weights()}\if{html}{\out{
}} @@ -273,8 +271,8 @@ are \emph{afrikaans}, \emph{arabic}, \emph{armenian}, \emph{basque}, \emph{benga } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Term_Matrix_Adjust}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-sparse_term_matrix-Term_Matrix_Adjust}{}}} \subsection{Method \code{Term_Matrix_Adjust()}}{ \subsection{Usage}{ \if{html}{\out{
}}\preformatted{sparse_term_matrix$Term_Matrix_Adjust(sparsity_thresh = 1)}\if{html}{\out{
}} @@ -289,8 +287,8 @@ are \emph{afrikaans}, \emph{arabic}, \emph{armenian}, \emph{basque}, \emph{benga } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-term_associations}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-sparse_term_matrix-term_associations}{}}} \subsection{Method \code{term_associations()}}{ \subsection{Usage}{ \if{html}{\out{
}}\preformatted{sparse_term_matrix$term_associations( @@ -313,8 +311,8 @@ are \emph{afrikaans}, \emph{arabic}, \emph{armenian}, \emph{basque}, \emph{benga } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-most_frequent_terms}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-sparse_term_matrix-most_frequent_terms}{}}} \subsection{Method \code{most_frequent_terms()}}{ \subsection{Usage}{ \if{html}{\out{
}}\preformatted{sparse_term_matrix$most_frequent_terms( @@ -337,8 +335,8 @@ are \emph{afrikaans}, \emph{arabic}, \emph{armenian}, \emph{basque}, \emph{benga } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-clone}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-sparse_term_matrix-clone}{}}} \subsection{Method \code{clone()}}{ The objects of this class are cloneable with this method. \subsection{Usage}{ diff --git a/man/text_file_parser.Rd b/man/text_file_parser.Rd index 5237dd6..2bd9535 100644 --- a/man/text_file_parser.Rd +++ b/man/text_file_parser.Rd @@ -37,38 +37,33 @@ The text file should have a structure (such as an xml-structure), so that subset } \examples{ +\dontrun{ + library(textTinyR) # In case that the 'input_path_file' is a valid path #--------------------------------------------------- -# fp = text_file_parser(input_path_file = '/folder/input_data.txt', - -# output_path_file = '/folder/output_data.txt', - -# start_query = 'word_a', end_query = 'word_w', - -# min_lines = 1, trimmed_line = FALSE) +fp = text_file_parser(input_path_file = '/folder/input_data.txt', + output_path_file = '/folder/output_data.txt', + start_query = 'word_a', end_query = 'word_w', + min_lines = 1, trimmed_line = FALSE) # In case that the 'input_path_file' is a character vector of strings #-------------------------------------------------------------------- -# PATH_url = "https://FILE.xml" - -# con = url(PATH_url, method = "libcurl") - -# tmp_dat = read.delim(con, quote = "\"", comment.char = "", stringsAsFactors = FALSE) +PATH_url = "https://FILE.xml" +con = url(PATH_url, method = "libcurl") +tmp_dat = read.delim(con, quote = "\"", comment.char = "", stringsAsFactors = FALSE) -# vec_docs = unlist(lapply(1:length(as.vector(tmp_dat[, 1])), function(x) +vec_docs = unlist(lapply(1:length(as.vector(tmp_dat[, 1])), function(x) + trimws(tmp_dat[x, 1], which = "both"))) -# trimws(tmp_dat[x, 1], which = "both"))) - -# parse_data = text_file_parser(input_path_file = vec_docs, - -# start_query = c("", "", ""), - -# end_query = c("", "", ""), - -# min_lines = 1, trimmed_line = TRUE) +parse_data = text_file_parser(input_path_file = vec_docs, + start_query = c("", "", ""), + end_query = c("", "", ""), + min_lines = 1, trimmed_line = TRUE) + +} } diff --git a/man/text_intersect.Rd b/man/text_intersect.Rd index fc853fd..a79c84c 100644 --- a/man/text_intersect.Rd +++ b/man/text_intersect.Rd @@ -61,15 +61,15 @@ https://www.kaggle.com/c/home-depot-product-search-relevance/discussion/20427 by \section{Methods}{ \subsection{Public methods}{ \itemize{ -\item \href{#method-new}{\code{text_intersect$new()}} -\item \href{#method-count_intersect}{\code{text_intersect$count_intersect()}} -\item \href{#method-ratio_intersect}{\code{text_intersect$ratio_intersect()}} -\item \href{#method-clone}{\code{text_intersect$clone()}} +\item \href{#method-text_intersect-new}{\code{text_intersect$new()}} +\item \href{#method-text_intersect-count_intersect}{\code{text_intersect$count_intersect()}} +\item \href{#method-text_intersect-ratio_intersect}{\code{text_intersect$ratio_intersect()}} +\item \href{#method-text_intersect-clone}{\code{text_intersect$clone()}} } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-new}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-text_intersect-new}{}}} \subsection{Method \code{new()}}{ \subsection{Usage}{ \if{html}{\out{
}}\preformatted{text_intersect$new(token_list1 = NULL, token_list2 = NULL)}\if{html}{\out{
}} @@ -86,8 +86,8 @@ https://www.kaggle.com/c/home-depot-product-search-relevance/discussion/20427 by } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-count_intersect}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-text_intersect-count_intersect}{}}} \subsection{Method \code{count_intersect()}}{ \subsection{Usage}{ \if{html}{\out{
}}\preformatted{text_intersect$count_intersect(distinct = FALSE, letters = FALSE)}\if{html}{\out{
}} @@ -104,8 +104,8 @@ https://www.kaggle.com/c/home-depot-product-search-relevance/discussion/20427 by } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-ratio_intersect}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-text_intersect-ratio_intersect}{}}} \subsection{Method \code{ratio_intersect()}}{ \subsection{Usage}{ \if{html}{\out{
}}\preformatted{text_intersect$ratio_intersect(distinct = FALSE, letters = FALSE)}\if{html}{\out{
}} @@ -122,8 +122,8 @@ https://www.kaggle.com/c/home-depot-product-search-relevance/discussion/20427 by } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-clone}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-text_intersect-clone}{}}} \subsection{Method \code{clone()}}{ The objects of this class are cloneable with this method. \subsection{Usage}{ diff --git a/man/token_stats.Rd b/man/token_stats.Rd index 025d957..551a58c 100644 --- a/man/token_stats.Rd +++ b/man/token_stats.Rd @@ -129,23 +129,23 @@ lut <- tk$look_up_table(n_grams = 3) \section{Methods}{ \subsection{Public methods}{ \itemize{ -\item \href{#method-new}{\code{token_stats$new()}} -\item \href{#method-path_2vector}{\code{token_stats$path_2vector()}} -\item \href{#method-freq_distribution}{\code{token_stats$freq_distribution()}} -\item \href{#method-print_frequency}{\code{token_stats$print_frequency()}} -\item \href{#method-count_character}{\code{token_stats$count_character()}} -\item \href{#method-print_count_character}{\code{token_stats$print_count_character()}} -\item \href{#method-collocation_words}{\code{token_stats$collocation_words()}} -\item \href{#method-print_collocations}{\code{token_stats$print_collocations()}} -\item \href{#method-string_dissimilarity_matrix}{\code{token_stats$string_dissimilarity_matrix()}} -\item \href{#method-look_up_table}{\code{token_stats$look_up_table()}} -\item \href{#method-print_words_lookup_tbl}{\code{token_stats$print_words_lookup_tbl()}} -\item \href{#method-clone}{\code{token_stats$clone()}} +\item \href{#method-token_stats-new}{\code{token_stats$new()}} +\item \href{#method-token_stats-path_2vector}{\code{token_stats$path_2vector()}} +\item \href{#method-token_stats-freq_distribution}{\code{token_stats$freq_distribution()}} +\item \href{#method-token_stats-print_frequency}{\code{token_stats$print_frequency()}} +\item \href{#method-token_stats-count_character}{\code{token_stats$count_character()}} +\item \href{#method-token_stats-print_count_character}{\code{token_stats$print_count_character()}} +\item \href{#method-token_stats-collocation_words}{\code{token_stats$collocation_words()}} +\item \href{#method-token_stats-print_collocations}{\code{token_stats$print_collocations()}} +\item \href{#method-token_stats-string_dissimilarity_matrix}{\code{token_stats$string_dissimilarity_matrix()}} +\item \href{#method-token_stats-look_up_table}{\code{token_stats$look_up_table()}} +\item \href{#method-token_stats-print_words_lookup_tbl}{\code{token_stats$print_words_lookup_tbl()}} +\item \href{#method-token_stats-clone}{\code{token_stats$clone()}} } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-new}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-token_stats-new}{}}} \subsection{Method \code{new()}}{ \subsection{Usage}{ \if{html}{\out{
}}\preformatted{token_stats$new( @@ -174,8 +174,8 @@ lut <- tk$look_up_table(n_grams = 3) } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-path_2vector}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-token_stats-path_2vector}{}}} \subsection{Method \code{path_2vector()}}{ \subsection{Usage}{ \if{html}{\out{
}}\preformatted{token_stats$path_2vector()}\if{html}{\out{
}} @@ -183,8 +183,8 @@ lut <- tk$look_up_table(n_grams = 3) } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-freq_distribution}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-token_stats-freq_distribution}{}}} \subsection{Method \code{freq_distribution()}}{ \subsection{Usage}{ \if{html}{\out{
}}\preformatted{token_stats$freq_distribution()}\if{html}{\out{
}} @@ -192,8 +192,8 @@ lut <- tk$look_up_table(n_grams = 3) } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-print_frequency}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-token_stats-print_frequency}{}}} \subsection{Method \code{print_frequency()}}{ \subsection{Usage}{ \if{html}{\out{
}}\preformatted{token_stats$print_frequency(subset = NULL)}\if{html}{\out{
}} @@ -208,8 +208,8 @@ lut <- tk$look_up_table(n_grams = 3) } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-count_character}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-token_stats-count_character}{}}} \subsection{Method \code{count_character()}}{ \subsection{Usage}{ \if{html}{\out{
}}\preformatted{token_stats$count_character()}\if{html}{\out{
}} @@ -217,8 +217,8 @@ lut <- tk$look_up_table(n_grams = 3) } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-print_count_character}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-token_stats-print_count_character}{}}} \subsection{Method \code{print_count_character()}}{ \subsection{Usage}{ \if{html}{\out{
}}\preformatted{token_stats$print_count_character(number = NULL)}\if{html}{\out{
}} @@ -233,8 +233,8 @@ lut <- tk$look_up_table(n_grams = 3) } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-collocation_words}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-token_stats-collocation_words}{}}} \subsection{Method \code{collocation_words()}}{ \subsection{Usage}{ \if{html}{\out{
}}\preformatted{token_stats$collocation_words()}\if{html}{\out{
}} @@ -242,8 +242,8 @@ lut <- tk$look_up_table(n_grams = 3) } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-print_collocations}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-token_stats-print_collocations}{}}} \subsection{Method \code{print_collocations()}}{ \subsection{Usage}{ \if{html}{\out{
}}\preformatted{token_stats$print_collocations(word = NULL)}\if{html}{\out{
}} @@ -258,8 +258,8 @@ lut <- tk$look_up_table(n_grams = 3) } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-string_dissimilarity_matrix}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-token_stats-string_dissimilarity_matrix}{}}} \subsection{Method \code{string_dissimilarity_matrix()}}{ \subsection{Usage}{ \if{html}{\out{
}}\preformatted{token_stats$string_dissimilarity_matrix( @@ -294,8 +294,8 @@ lut <- tk$look_up_table(n_grams = 3) } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-look_up_table}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-token_stats-look_up_table}{}}} \subsection{Method \code{look_up_table()}}{ \subsection{Usage}{ \if{html}{\out{
}}\preformatted{token_stats$look_up_table(n_grams = NULL)}\if{html}{\out{
}} @@ -310,8 +310,8 @@ lut <- tk$look_up_table(n_grams = 3) } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-print_words_lookup_tbl}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-token_stats-print_words_lookup_tbl}{}}} \subsection{Method \code{print_words_lookup_tbl()}}{ \subsection{Usage}{ \if{html}{\out{
}}\preformatted{token_stats$print_words_lookup_tbl(n_gram = NULL)}\if{html}{\out{
}} @@ -326,8 +326,8 @@ lut <- tk$look_up_table(n_grams = 3) } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-clone}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-token_stats-clone}{}}} \subsection{Method \code{clone()}}{ The objects of this class are cloneable with this method. \subsection{Usage}{ diff --git a/man/vocabulary_parser.Rd b/man/vocabulary_parser.Rd index 23f1133..160a4bf 100644 --- a/man/vocabulary_parser.Rd +++ b/man/vocabulary_parser.Rd @@ -110,13 +110,13 @@ Stemming of the english language is done using the porter2-stemmer, for details } \examples{ -library(textTinyR) - -# vps = vocabulary_parser(input_path_file = '/folder/input_data.txt', - -# start_query = 'start_word', end_query = 'end_word', +\dontrun{ -# vocabulary_path_file = '/folder/vocab.txt', +library(textTinyR) -# to_lower = TRUE, split_string = TRUE) + vps = vocabulary_parser(input_path_file = '/folder/input_data.txt', + start_query = 'start_word', end_query = 'end_word', + vocabulary_path_file = '/folder/vocab.txt', + to_lower = TRUE, split_string = TRUE) +} } diff --git a/src/Makevars b/src/Makevars index 9ec610c..0dd6477 100644 --- a/src/Makevars +++ b/src/Makevars @@ -1,5 +1,4 @@ PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) -DARMA_64BIT_WORD PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) $(SHLIB_OPENMP_CXXFLAGS) -CXX_STD = CXX11 PKG_CPPFLAGS = -I../inst/include/ diff --git a/src/Makevars.win b/src/Makevars.win index bfe6a5f..ca18a35 100644 --- a/src/Makevars.win +++ b/src/Makevars.win @@ -1,4 +1,3 @@ -PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) -DARMA_64BIT_WORD -PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) $(SHLIB_OPENMP_CXXFLAGS) -mthreads -CXX_STD = CXX11 -PKG_CPPFLAGS = -I../inst/include/ +PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) -DARMA_64BIT_WORD +PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) $(SHLIB_OPENMP_CXXFLAGS) +PKG_CPPFLAGS = -I../inst/include/ diff --git a/src/export_all_funcs.cpp b/src/export_all_funcs.cpp index d6c076f..9790010 100644 --- a/src/export_all_funcs.cpp +++ b/src/export_all_funcs.cpp @@ -1,7 +1,6 @@ # include // [[Rcpp::depends("RcppArmadillo")]] // [[Rcpp::plugins(openmp)]] -// [[Rcpp::plugins(cpp11)]] // [[Rcpp::depends(BH)]] diff --git a/src/init.c b/src/init.c index 5a45cb8..53cae54 100644 --- a/src/init.c +++ b/src/init.c @@ -8,69 +8,69 @@ */ /* .Call calls */ -extern SEXP _textTinyR_Adj_Sparsity(SEXP, SEXP, SEXP, SEXP, SEXP); -extern SEXP _textTinyR_append_data(SEXP, SEXP); -extern SEXP _textTinyR_Associations_Cpp(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); -extern SEXP _textTinyR_batch_2file(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); -extern SEXP _textTinyR_batch_calculation(SEXP, SEXP); -extern SEXP _textTinyR_big_parser(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); -extern SEXP _textTinyR_big_splitter_bytes(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); -extern SEXP _textTinyR_big_tokenize(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); -extern SEXP _textTinyR_Collocations_ngrams(SEXP, SEXP, SEXP, SEXP, SEXP); -extern SEXP _textTinyR_convert_bytes(SEXP, SEXP); -extern SEXP _textTinyR_COR_MATR(SEXP, SEXP, SEXP); -extern SEXP _textTinyR_COS(SEXP, SEXP, SEXP, SEXP); -extern SEXP _textTinyR_cosine_dist(SEXP, SEXP, SEXP); -extern SEXP _textTinyR_Cosine_dist(SEXP, SEXP, SEXP); -extern SEXP 
_textTinyR_Count_characters(SEXP, SEXP, SEXP, SEXP); -extern SEXP _textTinyR_count_rows(SEXP, SEXP); -extern SEXP _textTinyR_COUNTS_INTERSECT(SEXP, SEXP, SEXP, SEXP); -extern SEXP _textTinyR_dense_2sparse_mat(SEXP); -extern SEXP _textTinyR_DICE(SEXP, SEXP); -extern SEXP _textTinyR_Dice_similarity(SEXP, SEXP, SEXP); -extern SEXP _textTinyR_Dissimilarity_mat(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); -extern SEXP _textTinyR_DIST(SEXP, SEXP, SEXP, SEXP, SEXP); -extern SEXP _textTinyR_DISTINCT_WORD_INTERSECT(SEXP, SEXP); -extern SEXP _textTinyR_file_parser(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); -extern SEXP _textTinyR_Frequency_distribution(SEXP, SEXP, SEXP, SEXP); -extern SEXP _textTinyR_idf_global_term_weights(SEXP, SEXP); -extern SEXP _textTinyR_inner_cm(SEXP, SEXP, SEXP); -extern SEXP _textTinyR_inner_jd(SEXP, SEXP, SEXP, SEXP); -extern SEXP _textTinyR_inner_reduce_dims(SEXP, SEXP, SEXP, SEXP); -extern SEXP _textTinyR_INTERSECT(SEXP, SEXP); -extern SEXP _textTinyR_JACCARD(SEXP, SEXP); -extern SEXP _textTinyR_jaccard_dice(SEXP, SEXP, SEXP, SEXP); -extern SEXP _textTinyR_keep_idxs(SEXP, SEXP); -extern SEXP _textTinyR_Levenshtein_dist(SEXP, SEXP); -extern SEXP _textTinyR_load_sparse_(SEXP); -extern SEXP _textTinyR_Look_up_tbl(SEXP, SEXP); -extern SEXP _textTinyR_modulus(SEXP, SEXP); -extern SEXP _textTinyR_Most_Freq_Terms(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); -extern SEXP _textTinyR_Not_Duplicated(SEXP); -extern SEXP _textTinyR_NUM_LETTERS_DISTINCT(SEXP); -extern SEXP _textTinyR_Path_2vector(SEXP, SEXP); -extern SEXP _textTinyR_RATIO_DISTINCT(SEXP, SEXP, SEXP, SEXP); -extern SEXP _textTinyR_read_CHARS(SEXP, SEXP, SEXP); -extern SEXP _textTinyR_read_ROWS(SEXP, SEXP, SEXP, SEXP); -extern SEXP _textTinyR_read_ROWS_wv(SEXP, SEXP); -extern SEXP _textTinyR_reduce_dims_with_correlation(SEXP, SEXP, SEXP, SEXP, SEXP); -extern SEXP _textTinyR_reduced_word_vectors(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); -extern SEXP _textTinyR_res_term_matrix(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); -extern SEXP _textTinyR_res_token(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); -extern SEXP _textTinyR_res_token_list(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); -extern SEXP _textTinyR_res_token_vector(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); -extern SEXP _textTinyR_save_sparse_(SEXP, SEXP); -extern SEXP _textTinyR_sparsity_float(SEXP); -extern SEXP _textTinyR_sp_means(SEXP, SEXP); -extern SEXP _textTinyR_sp_sums(SEXP, SEXP); -extern SEXP _textTinyR_sublist(SEXP, SEXP); -extern SEXP _textTinyR_tf_idf_exclude(SEXP, SEXP); -extern SEXP _textTinyR_UNION(SEXP, SEXP); -extern SEXP _textTinyR_UNIQUE(SEXP); -extern SEXP _textTinyR_vec_parser(SEXP, SEXP, SEXP, SEXP, SEXP); -extern SEXP _textTinyR_vocabulary_counts(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); -extern SEXP 
_textTinyR_vocabulary_counts_big_tokenize(SEXP, SEXP, SEXP, SEXP); -extern SEXP _textTinyR_word_vectors_methods(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); +extern SEXP _textTinyR_Adj_Sparsity(void *, void *, void *, void *, void *); +extern SEXP _textTinyR_append_data(void *, void *); +extern SEXP _textTinyR_Associations_Cpp(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *); +extern SEXP _textTinyR_batch_2file(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *); +extern SEXP _textTinyR_batch_calculation(void *, void *); +extern SEXP _textTinyR_big_parser(void *, void *, void *, void *, void *, void *, void *); +extern SEXP _textTinyR_big_splitter_bytes(void *, void *, void *, void *, void *, void *); +extern SEXP _textTinyR_big_tokenize(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *); +extern SEXP _textTinyR_Collocations_ngrams(void *, void *, void *, void *, void *); +extern SEXP _textTinyR_convert_bytes(void *, void *); +extern SEXP _textTinyR_COR_MATR(void *, void *, void *); +extern SEXP _textTinyR_COS(void *, void *, void *, void *); +extern SEXP _textTinyR_cosine_dist(void *, void *, void *); +extern SEXP _textTinyR_Cosine_dist(void *, void *, void *); +extern SEXP _textTinyR_Count_characters(void *, void *, void *, void *); +extern SEXP _textTinyR_count_rows(void *, void *); +extern SEXP _textTinyR_COUNTS_INTERSECT(void *, void *, void *, void *); +extern SEXP _textTinyR_dense_2sparse_mat(void *); +extern SEXP _textTinyR_DICE(void *, void *); +extern SEXP _textTinyR_Dice_similarity(void *, void *, void *); +extern SEXP _textTinyR_Dissimilarity_mat(void *, void *, void *, void *, void *, void *, void *, void *); +extern SEXP _textTinyR_DIST(void *, void *, void *, void *, void *); +extern SEXP _textTinyR_DISTINCT_WORD_INTERSECT(void *, void *); +extern SEXP _textTinyR_file_parser(void *, void *, void *, void *, void *, void *, void *); +extern SEXP _textTinyR_Frequency_distribution(void *, void *, void *, void *); +extern SEXP _textTinyR_idf_global_term_weights(void *, void *); +extern SEXP _textTinyR_inner_cm(void *, void *, void *); +extern SEXP _textTinyR_inner_jd(void *, void *, void *, void *); +extern SEXP _textTinyR_inner_reduce_dims(void *, void *, void *, void *); +extern SEXP _textTinyR_INTERSECT(void *, void *); +extern SEXP _textTinyR_JACCARD(void *, void *); +extern SEXP _textTinyR_jaccard_dice(void *, void *, void *, void *); +extern SEXP _textTinyR_keep_idxs(void *, void *); +extern SEXP _textTinyR_Levenshtein_dist(void *, void *); +extern SEXP _textTinyR_load_sparse_(void *); +extern SEXP _textTinyR_Look_up_tbl(void *, void *); +extern SEXP _textTinyR_modulus(void *, void *); +extern SEXP _textTinyR_Most_Freq_Terms(void *, void *, void *, void *, void *, void *); +extern SEXP _textTinyR_Not_Duplicated(void *); +extern SEXP _textTinyR_NUM_LETTERS_DISTINCT(void *); +extern SEXP _textTinyR_Path_2vector(void *, void *); +extern SEXP _textTinyR_RATIO_DISTINCT(void *, void *, void *, void *); +extern SEXP _textTinyR_read_CHARS(void *, void *, void *); +extern SEXP 
_textTinyR_read_ROWS(void *, void *, void *, void *); +extern SEXP _textTinyR_read_ROWS_wv(void *, void *); +extern SEXP _textTinyR_reduce_dims_with_correlation(void *, void *, void *, void *, void *); +extern SEXP _textTinyR_reduced_word_vectors(void *, void *, void *, void *, void *, void *); +extern SEXP _textTinyR_res_term_matrix(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *); +extern SEXP _textTinyR_res_token(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *); +extern SEXP _textTinyR_res_token_list(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *); +extern SEXP _textTinyR_res_token_vector(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *); +extern SEXP _textTinyR_save_sparse_(void *, void *); +extern SEXP _textTinyR_sp_means(void *, void *); +extern SEXP _textTinyR_sp_sums(void *, void *); +extern SEXP _textTinyR_sparsity_float(void *); +extern SEXP _textTinyR_sublist(void *, void *); +extern SEXP _textTinyR_tf_idf_exclude(void *, void *); +extern SEXP _textTinyR_UNION(void *, void *); +extern SEXP _textTinyR_UNIQUE(void *); +extern SEXP _textTinyR_vec_parser(void *, void *, void *, void *, void *); +extern SEXP _textTinyR_vocabulary_counts(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *); +extern SEXP _textTinyR_vocabulary_counts_big_tokenize(void *, void *, void *, void *); +extern SEXP _textTinyR_word_vectors_methods(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *); static const R_CallMethodDef CallEntries[] = { {"_textTinyR_Adj_Sparsity", (DL_FUNC) &_textTinyR_Adj_Sparsity, 5}, @@ -125,9 +125,9 @@ static const R_CallMethodDef CallEntries[] = { {"_textTinyR_res_token_list", (DL_FUNC) &_textTinyR_res_token_list, 31}, {"_textTinyR_res_token_vector", (DL_FUNC) &_textTinyR_res_token_vector, 31}, {"_textTinyR_save_sparse_", (DL_FUNC) &_textTinyR_save_sparse_, 2}, - {"_textTinyR_sparsity_float", (DL_FUNC) &_textTinyR_sparsity_float, 1}, {"_textTinyR_sp_means", (DL_FUNC) &_textTinyR_sp_means, 2}, {"_textTinyR_sp_sums", (DL_FUNC) &_textTinyR_sp_sums, 2}, + {"_textTinyR_sparsity_float", (DL_FUNC) &_textTinyR_sparsity_float, 1}, {"_textTinyR_sublist", (DL_FUNC) &_textTinyR_sublist, 2}, {"_textTinyR_tf_idf_exclude", (DL_FUNC) &_textTinyR_tf_idf_exclude, 2}, {"_textTinyR_UNION", (DL_FUNC) &_textTinyR_UNION, 2}, diff --git a/src/rcpp_similarities.cpp b/src/rcpp_similarities.cpp index 7e5d3df..17f2da7 100644 --- a/src/rcpp_similarities.cpp +++ b/src/rcpp_similarities.cpp @@ -1,7 +1,6 @@ # include // 
[[Rcpp::depends("RcppArmadillo")]] // [[Rcpp::plugins(openmp)]] -// [[Rcpp::plugins(cpp11)]] // [[Rcpp::depends(BH)]] #include diff --git a/src/term_matrix.h b/src/term_matrix.h index 075931a..fd19166 100644 --- a/src/term_matrix.h +++ b/src/term_matrix.h @@ -453,7 +453,7 @@ class term_matrix { if (count + 1 == tmp_print_rows) { - Rprintf("\rtotal.number.lines.processed.tokenization: %3d", count + 1); + Rprintf("\rtotal.number.lines.processed.tokenization: %3lld", count + 1); tmp_print_rows += print_every_rows; } @@ -644,7 +644,7 @@ class term_matrix { if (count + 1 == tmp_print_rows) { - Rprintf("\rtotal.number.lines.processed.tokenization: %3d", count + 1); + Rprintf("\rtotal.number.lines.processed.tokenization: %3lld", count + 1); tmp_print_rows += print_every_rows; } diff --git a/src/token_big_files.h b/src/token_big_files.h index 7b2c84a..185af99 100644 --- a/src/token_big_files.h +++ b/src/token_big_files.h @@ -859,7 +859,7 @@ class big_files { if (verbose_print <= tmp_mem || flag_peek) { - Rprintf("\rtotal.number.lines.processed: %3d", Lines); + Rprintf("\rtotal.number.lines.processed: %3lld", Lines); Rprintf("\tdata.processed.approx.: %.1f %%", tmp_mem); diff --git a/src/word_vecs_pointer_embedding.cpp b/src/word_vecs_pointer_embedding.cpp index 497af09..f201150 100644 --- a/src/word_vecs_pointer_embedding.cpp +++ b/src/word_vecs_pointer_embedding.cpp @@ -1,7 +1,6 @@ # include // [[Rcpp::depends("RcppArmadillo")]] // [[Rcpp::plugins(openmp)]] -// [[Rcpp::plugins(cpp11)]] // [[Rcpp::depends(BH)]] @@ -237,7 +236,7 @@ class PREPROCESS_WORD_VECS { if (nr_rows + 1 == tmp_print_rows) { - Rprintf("\rtotal.number.lines.processed.input: %3d", nr_rows + 1); + Rprintf("\rtotal.number.lines.processed.input: %3lld", nr_rows + 1); tmp_print_rows += print_every_rows; } @@ -330,7 +329,7 @@ class PREPROCESS_WORD_VECS { if (sec_nr_rows + 1 == sec_tmp_print_rows) { - Rprintf("\rtotal.number.lines.processed.output: %3d", sec_nr_rows + 1); + Rprintf("\rtotal.number.lines.processed.output: %3lld", sec_nr_rows + 1); sec_tmp_print_rows += print_every_rows; }
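For context on the `%3lld` replacements above: `Rprintf()` accepts printf-style format strings, so a `long long` argument needs the `ll` length modifier, which is exactly what the CRAN warning "format specifies type 'int' but the argument has type 'long long'" points out. Below is a minimal standalone sketch (not part of the package; plain `printf` is used as a stand-in for `Rprintf`, and the variable name is illustrative only):

```cpp
// Minimal sketch: why "%3d" triggers the warning for a 'long long' counter.
#include <cstdio>

int main() {
  long long lines_processed = 1000000;  // same type as the line counters in the headers above

  // Mismatched specifier: "%3d" expects an 'int', so passing a 'long long'
  // is what the compiler warned about (undefined behaviour in C/C++):
  // std::printf("total.number.lines.processed: %3d\n", lines_processed);

  // Correct specifier: "%3lld" matches the 'long long' argument.
  std::printf("total.number.lines.processed: %3lld\n", lines_processed);
  return 0;
}
```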