rth · rth · Dec 10, 2018 · Nov 26, 2018 · Nov 26, 2018 · Nov 26, 2018
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -1,13 +1,10 @@
 version: 2
 
-defaults: &defaults
-  working_directory: ~/repo
-  docker:
-    - image: rust:1.30.1-slim-stretch
-
 jobs:
-  build:
-    <<: *defaults
+  rust-stable:
+    working_directory: ~/repo
+    docker:
+      - image: rust:1.30.1-slim-stretch
     steps:
       - checkout
       - run:
@@ -26,23 +23,59 @@ jobs:
           command: |
             cargo test
 
+  python-wrapper:
+    working_directory: ~/repo
+    docker:
+      - image: rustlang/rust:nightly
+    steps:
+      - checkout
+      - run:
+          name: dependencies
+          command: |
+            apt-get update
+            apt-get install -y python3-pip
+            pip3 install -r python/requirements.txt
+            pip3 install pytest pytest-cov
+
+      - run:
+          name: build
+          command: |
+            cd python/
+            python3 setup.py develop
+
+      - run:
+          name: test
+          command: |
+            cd python
+            python3 -m pytest text_vectorize
+
   lint:
-    <<: *defaults
+    working_directory: ~/repo
+    docker:
+      - image: rthz/rust-nightly-python37
     steps:
       - checkout
 
       - run:
           name: dependencies
-          command: rustup component add rustfmt-preview
+          command: |
+            apt-get update
+            rustup component add rustfmt-preview
+            python -m pip install flake8 black
 
       - run:
           name: lint
           command: |
             cargo fmt -- --check
+            flake8 --max-line-length=88 python/ benchmarks/
+            black --check python/ benchmarks/
+            cd python && cargo fmt -- --check
+
 
 workflows:
   version: 2
   build:
     jobs:
-      - build
+      - rust-stable
+      - python-wrapper
       - lint
diff --git a/.gitignore b/.gitignore
@@ -19,6 +19,8 @@ wheels/
 .installed.cfg
 *.egg
 MANIFEST
+__pycache__/
+*.pyc
 
 # Unit test / coverage reports
 htmlcov/

diff --git a/README.md b/README.md
@@ -18,7 +18,7 @@ estimators in [scikit-learn](https://scikit-learn.org/).
 
 ### Planned
 
- - Python wrapper ([#1](https://github.com/rth/text-vectorize/pull/1))
+ - Python wrapper ([#6](https://github.com/rth/text-vectorize/pull/6))
  - Support for word and character n-grams ([#2](https://github.com/rth/text-vectorize/issues/2))
  - Binary Python wheels ([#3](https://github.com/rth/text-vectorize/issues/3))
  - IDF transforms and TfidfVectorizer ([#4](https://github.com/rth/text-vectorize/issues/4))

diff --git a/benchmarks/bench_naive_counter.py b/benchmarks/bench_naive_counter.py
diff --git a/benchmarks/bench_scikit-learn.py b/benchmarks/bench_scikit-learn.py
@@ -1,28 +1,48 @@
 from time import time
 from glob import glob
+from scipy.sparse import csr_matrix
 
 from sklearn.feature_extraction.text import HashingVectorizer
 from sklearn.feature_extraction.text import CountVectorizer
 
-if __name__ == '__main__':
-    input_files = list(glob('./data/*/*'))
+from text_vectorize._lib import hash_vectorize
+
+if __name__ == "__main__":
+    input_files = list(glob("./data/*/*"))
     data = []
     for file_path in input_files:
-        with open(file_path, 'rt') as fh:
+        with open(file_path, "rt") as fh:
             data.append(fh.read())
 
     t0 = time()
-    vect = CountVectorizer(lowercase=False)
-    vect.fit_transform(data)
+    indices, indptr, data_out = hash_vectorize(data)
+    csr_matrix((data_out, indices, indptr))
 
     dt = time() - t0
 
-    print(f"CountVectorizer: vectorized {len(data)} documents in {dt:.2f}s")
+    print(
+        "HashingVectorizer (text-vectorize): vectorized {}"
+        "documents in {:.2f}s".format(len(data), dt)
+    )
 
     t0 = time()
     vect = HashingVectorizer(lowercase=False)
     vect.fit_transform(data)
 
     dt = time() - t0
 
-    print(f"HashingVectorizer: vectorized {len(data)} documents in {dt:.2f}s")
+    print(
+        "HashingVectorizer (scikit-learn): vectorized {}"
+        "documents in {:.2f}s".format(len(data), dt)
+    )
+
+    t0 = time()
+    vect = CountVectorizer(lowercase=False)
+    vect.fit_transform(data)
+
+    dt = time() - t0
+
+    print(
+        "CountVectorizer (scikit-learn): vectorized {}"
+        "documents in {:.2f}s".format(len(data), dt)
+    )
diff --git a/python/Cargo.toml b/python/Cargo.toml
@@ -0,0 +1,21 @@
+[package]
+name = "text-vectorize-py-wrapper"
+version = "0.1.0-alpha.1"
+authors = ["Roman Yurchak <[email protected]>"]
+
+[lib]
+name = "rust_ext"
+crate-type = ["cdylib"]
+
+[dependencies]
+ndarray = "0.12"
+text-vectorize = {"path" = "../"}
+
+[dependencies.numpy]
+version = "0.4.0"
+git = "https://github.com/rust-numpy/rust-numpy"
+features = ["python3"]
+
+[dependencies.pyo3]
+version = "0.5.2"
+features = ["extension-module"]
diff --git a/python/Dockerfile b/python/Dockerfile
@@ -0,0 +1,16 @@
+FROM rustlang/rust:nightly
+
+COPY requirements.txt .
+
+RUN wget https://www.python.org/ftp/python/3.7.1/Python-3.7.1.tgz && \
+    tar xzf Python-3.7.1.tgz && \
+    cd Python-3.7.1 && \
+    ./configure --enable-optimizations && \
+    make build_all -j3 && \
+    make altinstall && \
+    update-alternatives --install /usr/bin/python python /usr/local/bin/python3.7 50 && \
+    cd .. &&  rm -rf Python-3.7.1*
+RUN python -m pip install -r requirements.txt && \
+    python -m pip install pytest flake8
+WORKDIR /src/python
+
diff --git a/python/README.md b/python/README.md
@@ -0,0 +1,29 @@
+# py-text-vectorize
+
+This is a Python wrapper for the Rust text-vectorize crate.
+
+WIP.
+
+## Installation
+
+### Manual install
+
+This requires Python 3.5+ as well as Rust nightly >=1.30.0
+(due to [rust-numpy](https://github.com/rust-numpy/rust-numpy) and
+[pyo3](https://github.com/PyO3/pyo3) requirements).
+
+To build the Python package, run,
+```
+pip install -r requirements.txt
+python3 setup.py develop --user
+```
+
+### Docker environment
+
+The easiest approach might be to use Docker to set up a build environment:
+
+```
+docker build -t py-text-vectorize-env .
+./run_docker_env.sh
+python3 setup.py develop --user
+```
diff --git a/python/requirements.txt b/python/requirements.txt
@@ -0,0 +1,4 @@
+setuptools
+setuptools-rust>=0.10.2
+numpy>=1.15.0
+scipy>=1.1.0
diff --git a/python/run_docker_env.sh b/python/run_docker_env.sh
@@ -0,0 +1,2 @@
+#!/bin/sh
+docker run --rm -v $PWD/../:/src -it rthz/rust-nightly-python37 /bin/bash
diff --git a/python/setup.py b/python/setup.py
@@ -0,0 +1,24 @@
+from setuptools import find_packages, setup
+from setuptools_rust import RustExtension
+
+
+with open("./requirements.txt", "rt") as fh:
+    install_requires = fh.read().splitlines()
+
+setup(
+    name="py-text-vectorize",
+    version="0.1.0a1",
+    description="Example of python-extension using rust-numpy",
+    rust_extensions=[
+        RustExtension(
+            "text_vectorize._lib",
+            "./Cargo.toml",
+            rustc_flags=["--cfg=Py_3"],
+            features=["numpy/python3"],
+        )
+    ],
+    install_requires=install_requires,
+    packages=find_packages(),
+    python_requires="~=3.5",
+    zip_safe=False,
+)
diff --git a/python/src/lib.rs b/python/src/lib.rs
@@ -0,0 +1,60 @@
+#![feature(specialization)]
+
+#[macro_use]
+extern crate ndarray;
+extern crate numpy;
+extern crate pyo3;
+extern crate text_vectorize;
+
+use numpy::{IntoPyArray, PyArray1};
+use pyo3::prelude::{pymodinit, ObjectProtocol, Py, PyModule, PyObject, PyResult, Python};
+use pyo3::types::PyIterator;
+use text_vectorize::HashingVectorizer;
+
+use text_vectorize::tokenize;
+
+/// Converts a vector of `usize` values into a vector of `i32` values.
+///
+/// The input is consumed and the output has the same length and element order.
+/// Collecting from the consuming iterator lets the output be allocated once,
+/// instead of growing an empty vector push by push.
+///
+/// # Panics
+///
+/// Panics with "Cannot safely coerce indices to i32!" on the first element
+/// that is larger than `i32::MAX`, since the cast would otherwise wrap silently.
+fn vec_usize_to_i32(vec: Vec<usize>) -> Vec<i32> {
+    vec.into_iter()
+        .map(|element| {
+            if element > std::i32::MAX as usize {
+                panic!("Cannot safely coerce indices to i32!");
+            }
+            // The range check above guarantees this cast is lossless.
+            element as i32
+        })
+        .collect()
+}
+
+// Initializer for the `_lib` extension module: registers the single function
+// `hash_vectorize` on the module `m` and returns `Ok(())`.
+#[pymodinit]
+fn _lib(_py: Python, m: &PyModule) -> PyResult<()> {
+    // Exposed to Python as `hash_vectorize(x)`.
+    //
+    // `x` is iterated on the Python side; each item is extracted as a `String`.
+    // A failure to obtain an iterator, to fetch an item, or to extract a string
+    // is propagated to the caller through `?`.
+    //
+    // Returns a tuple `(indices, indptr, data)` of three 1-D `i32` arrays built
+    // from the fields of the `fit_transform` result.
+    // NOTE(review): the benchmark script passes these to `scipy.sparse.csr_matrix`,
+    // so they are presumably CSR components -- confirm against `text_vectorize`.
+    //
+    // Panics inside `vec_usize_to_i32` if any index/pointer exceeds `i32::MAX`.
+    #[pyfn(m, "hash_vectorize")]
+    fn hash_vectorize(
+        py: Python,
+        x: PyObject,
+    ) -> PyResult<(Py<PyArray1<i32>>, Py<PyArray1<i32>>, Py<PyArray1<i32>>)> {
+        let text = PyIterator::from_object(py, &x)?;
+
+        // All documents are copied into owned Rust strings before vectorizing.
+        let mut collection: Vec<String> = Vec::new();
+
+        for document in text {
+            // First `?` handles an error raised while iterating, the second an
+            // item that cannot be extracted as a string.
+            let document = document?;
+            let document = ObjectProtocol::extract::<String>(document)?;
+            collection.push(document);
+        }
+
+        let mut vect = HashingVectorizer::new();
+        let x = vect.fit_transform(&collection);
+
+        // `indices` and `indptr` are narrowed from `usize` to `i32`; `data` is
+        // passed through unchanged.
+        let indices = vec_usize_to_i32(x.indices);
+        let indptr = vec_usize_to_i32(x.indptr);
+        let data = x.data;
+
+        // Each vector is handed to NumPy and converted with `to_owned()` into
+        // the `Py<PyArray1<i32>>` handles named in the return type.
+        Ok((
+            indices.into_pyarray(py).to_owned(),
+            indptr.into_pyarray(py).to_owned(),
+            data.into_pyarray(py).to_owned(),
+        ))
+    }
+
+    Ok(())
+}
diff --git a/python/text_vectorize/__init__.py b/python/text_vectorize/__init__.py
@@ -0,0 +1,2 @@
+from . import _lib  # noqa
+from .base import HashingVectorizer  # noqa
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		#!/bin/sh
		docker run --rm -v $PWD/../:/src -it rthz/rust-nightly-python37 /bin/bash
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		from . import _lib # noqa
		from .base import HashingVectorizer # noqa