diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml deleted file mode 100644 index 5137986e..00000000 --- a/.github/workflows/build.yml +++ /dev/null @@ -1,21 +0,0 @@ -name: Build -on: - push: - branches: - - v2-main - - v2-dev - pull_request: - types: [opened, synchronize, reopened] -jobs: - sonarcloud: - name: SonarCloud - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - with: - fetch-depth: 0 # Shallow clones should be disabled for a better relevancy of analysis - - name: SonarCloud Scan - uses: SonarSource/sonarcloud-github-action@master - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Needed to get PR information, if any - SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }} diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 00000000..ca41d77a --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,104 @@ +name: Test +on: + push: + branches: + - v2-main + - v2-dev + - v2/github-actions + pull_request: + types: [opened, synchronize, reopened] +jobs: + lint: + name: Lint + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Set up Python 3.8 + uses: actions/setup-python@v1 + with: + python-version: 3.8 + - name: Install deps + run: | + pip install -r requirements.dev.txt + - name: Run lint script + run: sh ./scripts/lint.sh + test: + if: always() + name: "Test with TF ${{ matrix.tensorflow_version }} - ${{ matrix.group }}" + runs-on: ubuntu-latest + strategy: + matrix: + group: [ 1, 2, 3, 4, 5, 6 ] + tensorflow_version: [ 2.2.0, 2.3.0 ] + steps: + - uses: actions/checkout@v2 + - name: Set up Python 3.8 + uses: actions/setup-python@v1 + with: + python-version: 3.8 + - name: Install deps + run: | + python -m pip install --upgrade pip + pip install tensorflow==${{ matrix.tensorflow_version }} + pip install -r requirements.dev.txt + pip install -r requirements.txt + + - name: Run pytest + run: 'pytest + --doctest-modules + --junitxml=test-reports/junit-${{ 
matrix.tensorflow_version }}-${{ matrix.group }}.xml + --cov=kashgari + --cov-report=xml:cov-reports/coverage-${{ matrix.tensorflow_version }}-${{ matrix.group }}.xml + --cov-report term + --cov-config .coveragerc + --cov + --splits 6 + --group ${{ matrix.group }} + tests/' + + - name: Upload unit test + uses: actions/upload-artifact@v2 + with: + name: junitxml-${{ matrix.tensorflow_version }}-${{ matrix.group }} + path: test-reports + + - name: Upload coverage + uses: actions/upload-artifact@v2 + with: + name: coverage-${{ matrix.tensorflow_version }}-${{ matrix.group }} + path: cov-reports + + sonarcloud: + if: "!contains(github.event.head_commit.message, 'skip ci')" + name: SonarCloud + runs-on: ubuntu-latest + needs: test + steps: + - uses: actions/checkout@v2 + with: + fetch-depth: 0 # Shallow clones should be disabled for a better relevancy of analysis + - uses: actions/download-artifact@v2 + with: + path: artifacts + - name: Display structure of downloaded files + run: ls -R + - name: Copy Artifacts to target file + run: | + mkdir -p test-reports && cp artifacts/junit*/* test-reports + mkdir -p cov-reports && cp artifacts/cov*/* cov-reports + - name: Display structure of downloaded files + run: ls -R + - name: SonarCloud Scan + uses: SonarSource/sonarcloud-github-action@master + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Needed to get PR information, if any + SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }} +# - name: Publish Unit Test Results +# uses: EnricoMi/publish-unit-test-result-action@v1.3 +# if: always() +# with: +# github_token: ${{ secrets.GITHUB_TOKEN }} +# check_name: Unit Test Results +# files: test-results/*.xml +# report_individual_runs: true +# deduplicate_classes_by_file_name: false diff --git a/.readthedocs.yml b/.readthedocs.yml index e250fcd7..6feebaa2 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -21,8 +21,8 @@ formats: # Optionally set the version of Python and requirements required to build your docs python: - version: 3.7 + 
version: 3.8 install: - - requirements: ./docs/requirements.txt + - requirements: ./requirements.dev.txt - requirements: ./requirements.txt diff --git a/.test_durations b/.test_durations new file mode 100644 index 00000000..5d7277c6 --- /dev/null +++ b/.test_durations @@ -0,0 +1,238 @@ +[ + [ + "tests/test_corpus.py::TestChineseDailyNerCorpus::test_load_data", + 0.791890680999999 + ], + [ + "tests/test_corpus.py::TestSMP2018ECDTCorpus::test_load_data", + 0.07738860300000105 + ], + [ + "tests/test_generator.py::TestGenerator::test_batch_generator", + 0.1659608910000001 + ], + [ + "tests/test_generator.py::TestGenerator::test_corpus_generator", + 0.0004785480000002451 + ], + [ + "tests/test_generator.py::TestGenerator::test_huge_batch_size", + 0.0017857870000010934 + ], + [ + "tests/test_tokenizers.py::TestUtils::test_base_tokenizer", + 0.00045628799999875014 + ], + [ + "tests/test_tokenizers.py::TestUtils::test_bert_tokenizer", + 0.00681187300000019 + ], + [ + "tests/test_tokenizers.py::TestUtils::test_jieba_tokenizer", + 3.054017858000001 + ], + [ + "tests/test_utils.py::TestUtils::test_get_list_subset", + 0.0004596760000001865 + ], + [ + "tests/test_utils.py::TestUtils::test_unison_shuffled_copies", + 0.0008055950000001033 + ], + [ + "tests/test_classification/test_bi_gru_model.py::TestBiGRU_Model::test_basic_use", + 26.957131174000004 + ], + [ + "tests/test_classification/test_bi_gru_model.py::TestBiGRU_Model::test_multi_label", + 20.192117628000005 + ], + [ + "tests/test_classification/test_bi_gru_model.py::TestBiGRU_Model::test_with_word_embedding", + 11.372548664 + ], + [ + "tests/test_classification/test_bi_lstm_model.py::TestBiLSTM_Model::test_basic_use", + 28.27500425000001 + ], + [ + "tests/test_classification/test_bi_lstm_model.py::TestBiLSTM_Model::test_multi_label", + 28.04191687800001 + ], + [ + "tests/test_classification/test_bi_lstm_model.py::TestBiLSTM_Model::test_with_word_embedding", + 11.645751617000016 + ], + [ + 
"tests/test_classification/test_cnn_attention_model.py::TestCnnAttention_Model::test_basic_use", + 2.8881167019999907 + ], + [ + "tests/test_classification/test_cnn_attention_model.py::TestCnnAttention_Model::test_multi_label", + 14.609918974999971 + ], + [ + "tests/test_classification/test_cnn_attention_model.py::TestCnnAttention_Model::test_with_word_embedding", + 1.442862555000005 + ], + [ + "tests/test_classification/test_cnn_gru_model.py::TestCNN_GRU_Model::test_basic_use", + 5.812660918000006 + ], + [ + "tests/test_classification/test_cnn_gru_model.py::TestCNN_GRU_Model::test_multi_label", + 6.379847061999982 + ], + [ + "tests/test_classification/test_cnn_gru_model.py::TestCNN_GRU_Model::test_with_word_embedding", + 3.058245263999993 + ], + [ + "tests/test_classification/test_cnn_lstm_model.py::TestCNN_LSTM_Model::test_basic_use", + 6.5338332440000215 + ], + [ + "tests/test_classification/test_cnn_lstm_model.py::TestCNN_LSTM_Model::test_multi_label", + 7.435480620999982 + ], + [ + "tests/test_classification/test_cnn_lstm_model.py::TestCNN_LSTM_Model::test_with_word_embedding", + 2.917810065999987 + ], + [ + "tests/test_classification/test_cnn_model.py::TestBiGRU_Model::test_basic_use", + 1.4649902729999837 + ], + [ + "tests/test_classification/test_cnn_model.py::TestBiGRU_Model::test_multi_label", + 2.6597315669999944 + ], + [ + "tests/test_classification/test_cnn_model.py::TestBiGRU_Model::test_with_word_embedding", + 1.2886368380000022 + ], + [ + "tests/test_embeddings/test_bare_embedding.py::TestBareEmbedding::test_base_cases", + 0.10868100899998012 + ], + [ + "tests/test_embeddings/test_bare_embedding.py::TestBareEmbedding::test_with_model", + 4.935055361000025 + ], + [ + "tests/test_embeddings/test_transformer_embedding.py::TestBareEmbedding::test_base_cases", + 0.10724691600000824 + ], + [ + "tests/test_embeddings/test_transformer_embedding.py::TestBareEmbedding::test_with_model", + 5.357032331999989 + ], + [ + 
"tests/test_embeddings/test_transformer_embedding.py::TestTransferEmbedding::test_base_cases", + 1.2804299539999988 + ], + [ + "tests/test_embeddings/test_transformer_embedding.py::TestTransferEmbedding::test_with_model", + 6.972018837000036 + ], + [ + "tests/test_embeddings/test_word_embedding.py::TestBareEmbedding::test_base_cases", + 0.10442442199999391 + ], + [ + "tests/test_embeddings/test_word_embedding.py::TestBareEmbedding::test_with_model", + 5.0369742190000295 + ], + [ + "tests/test_embeddings/test_word_embedding.py::TestWordEmbedding::test_base_cases", + 0.24649433999999815 + ], + [ + "tests/test_embeddings/test_word_embedding.py::TestWordEmbedding::test_with_model", + 5.744442873999986 + ], + [ + "tests/test_labeling/test_bi_gru_crf_model.py::TestBiGRU_CRF_Model::test_basic_use", + 27.22844565699998 + ], + [ + "tests/test_labeling/test_bi_gru_crf_model.py::TestBiGRU_CRF_Model::test_with_bert", + 15.653805492000004 + ], + [ + "tests/test_labeling/test_bi_gru_crf_model.py::TestBiGRU_CRF_Model::test_with_word_embedding", + 15.798518177000062 + ], + [ + "tests/test_labeling/test_bi_gru_model.py::TestBiGRU_Model::test_basic_use", + 22.86519840400001 + ], + [ + "tests/test_labeling/test_bi_gru_model.py::TestBiGRU_Model::test_predict_and_callback", + 11.08044686200003 + ], + [ + "tests/test_labeling/test_bi_gru_model.py::TestBiGRU_Model::test_with_bert", + 13.311688684999979 + ], + [ + "tests/test_labeling/test_bi_gru_model.py::TestBiGRU_Model::test_with_word_embedding", + 13.12735856400002 + ], + [ + "tests/test_labeling/test_bi_lstm_crf_model.py::TestBiLSTM_CRF_Model::test_basic_use", + 30.706889874000012 + ], + [ + "tests/test_labeling/test_bi_lstm_crf_model.py::TestBiLSTM_CRF_Model::test_with_bert", + 17.221633065999924 + ], + [ + "tests/test_labeling/test_bi_lstm_crf_model.py::TestBiLSTM_CRF_Model::test_with_word_embedding", + 17.035713270999963 + ], + [ + "tests/test_labeling/test_bi_lstm_model.py::TestBiLSTM_Model::test_basic_use", + 27.88147675900001 + 
], + [ + "tests/test_labeling/test_bi_lstm_model.py::TestBiLSTM_Model::test_with_bert", + 14.351038211999992 + ], + [ + "tests/test_labeling/test_bi_lstm_model.py::TestBiLSTM_Model::test_with_word_embedding", + 13.007894668000006 + ], + [ + "tests/test_labeling/test_cnn_lstm_model.py::TestCNN_LSTM_Model::test_basic_use", + 24.476096025000004 + ], + [ + "tests/test_labeling/test_cnn_lstm_model.py::TestCNN_LSTM_Model::test_with_bert", + 13.546466815999906 + ], + [ + "tests/test_labeling/test_cnn_lstm_model.py::TestCNN_LSTM_Model::test_with_word_embedding", + 12.68343920000001 + ], + [ + "tests/test_processor/test_class_processor.py::TestClassificationProcessor::test_multi_label_processor", + 1.1768544880000036 + ], + [ + "tests/test_processor/test_class_processor.py::TestClassificationProcessor::test_processor", + 0.001395669999965321 + ], + [ + "tests/test_processor/test_sequence_processor.py::TestSequenceProcessor::test_label_processor", + 0.01879332600003636 + ], + [ + "tests/test_processor/test_sequence_processor.py::TestSequenceProcessor::test_text_processor", + 0.021944461000089177 + ], + [ + "tests/test_seq2seq/test_seq2seq.py::TestSeq2Seq::test_base_use_case", + 130.10345570900006 + ] +] \ No newline at end of file diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 4a3c97c9..00000000 --- a/.travis.yml +++ /dev/null @@ -1,69 +0,0 @@ -language: python -dist: xenial - -env: - global: - - COVERALLS_PARALLEL=true - matrix: - - TF_VERSION="2.1.0" TEST_FILE="tests/test_seq2seq/" - - TF_VERSION="2.1.0" TEST_FILE="tests/test_labeling/test_bi_gru_model.py tests/test_labeling/test_bi_lstm_model.py tests/test_labeling/test_cnn_lstm_model.py" - - TF_VERSION="2.1.0" TEST_FILE="tests/test_labeling/test_bi_gru_crf_model.py tests/test_labeling/test_bi_lstm_crf_model.py" - - TF_VERSION="2.1.0" TEST_FILE="tests/test_classification/" - - TF_VERSION="2.1.0" TEST_FILE="tests/test_embeddings/" - - TF_VERSION="2.1.0" TEST_FILE="tests/test_processor/" - - 
TF_VERSION="2.1.0" TEST_FILE="tests/test_corpus.py tests/test_utils.py tests/test_tokenizers.py" - - - TF_VERSION="2.2.0" TEST_FILE="tests/test_seq2seq/" - - TF_VERSION="2.2.0" TEST_FILE="tests/test_labeling/test_bi_gru_model.py tests/test_labeling/test_bi_lstm_model.py tests/test_labeling/test_cnn_lstm_model.py" - - TF_VERSION="2.2.0" TEST_FILE="tests/test_labeling/test_bi_gru_crf_model.py tests/test_labeling/test_bi_lstm_crf_model.py" - - TF_VERSION="2.2.0" TEST_FILE="tests/test_classification/" - - TF_VERSION="2.2.0" TEST_FILE="tests/test_embeddings/" - - TF_VERSION="2.2.0" TEST_FILE="tests/test_processor/" - - TF_VERSION="2.2.0" TEST_FILE="tests/test_corpus.py tests/test_utils.py tests/test_tokenizers.py" - - - TF_VERSION="2.3.0" TEST_FILE="tests/test_seq2seq/" - - TF_VERSION="2.3.0" TEST_FILE="tests/test_labeling/test_bi_gru_model.py tests/test_labeling/test_bi_lstm_model.py tests/test_labeling/test_cnn_lstm_model.py" - - TF_VERSION="2.3.0" TEST_FILE="tests/test_labeling/test_bi_gru_crf_model.py tests/test_labeling/test_bi_lstm_crf_model.py" - - TF_VERSION="2.3.0" TEST_FILE="tests/test_classification/" - - TF_VERSION="2.3.0" TEST_FILE="tests/test_embeddings/" - - TF_VERSION="2.3.0" TEST_FILE="tests/test_processor/" - - TF_VERSION="2.3.0" TEST_FILE="tests/test_corpus.py tests/test_utils.py tests/test_tokenizers.py" - -python: - - "3.6" - -cache: pip - -before_install: - - export BOTO_CONFIG=/dev/null - -stages: - - Lint - - Test - -install: - - pip install tensorflow==$TF_VERSION - - pip install -r requirements.dev.txt - - pip install -r requirements.txt - - git fetch --unshallow --quiet - - export PYTHONPATH=`pwd` - -script: - - TARGET="2.1.0"; if [ "$TF_VERSION" == "$TARGET" ]; then pip install "tensorflow_addons<0.10.0";else echo "No need to install."; fi - - python -c "import kashgari;print(f'kashgari version {kashgari.__version__}')" - - pytest --doctest-modules --junitxml=test-reports/junit.xml --cov=kashgari --cov-report=xml:coverage.xml --cov-report 
term --cov-report=html:htmlcov --cov-config .coveragerc $TEST_FILE - -after_script: - - coveralls - -notifications: - webhooks: https://coveralls.io/webhook - -jobs: - include: - - stage: Lint - python: "3.7" - install: - - pip install -r requirements.dev.txt - script: - - sh ./scripts/lint.sh diff --git a/docs/about/release-notes.md b/docs/about/release-notes.md index f6a8b5d5..546c24a3 100644 --- a/docs/about/release-notes.md +++ b/docs/about/release-notes.md @@ -17,6 +17,11 @@ pip show kashgari ## Current Release +### [2.0.1] - 2020.10.28 + +- ✨ Add `convert_to_saved_model` API for tf-serving use case. +- ✨ Add tf-serving documents. + ### [2.0.0] - 2020.09.10 This is a fully re-implemented version with TF2. diff --git a/docs/advance-use/tensorflow-serving.md b/docs/advance-use/tensorflow-serving.md new file mode 100644 index 00000000..d25ec32e --- /dev/null +++ b/docs/advance-use/tensorflow-serving.md @@ -0,0 +1,121 @@ +# Tensorflow Serving + +```python +from kashgari.tasks.classification import BiGRU_Model +from kashgari.corpus import SMP2018ECDTCorpus +from kashgari import utils + +train_x, train_y = SMP2018ECDTCorpus.load_data() + +model = BiGRU_Model() +model.fit(train_x, train_y) + +# Save model +utils.convert_to_saved_model(model, + model_path="saved_model/bgru", + version=1) +``` + +Then run tensorflow-serving. + +```bash +docker run -t --rm -p 8501:8501 -v "/saved_model:/models/" -e MODEL_NAME=bgru tensorflow/serving +``` + +Load processor from model, then predict with serving. + +We need to check model input keys first. + +```python +import requests +res = requests.get("http://localhost:8501/v1/models/bgru/metadata") +inputs = res.json()['metadata']['signature_def']['signature_def']['serving_default']['inputs'] +input_sample_keys = list(inputs.keys()) +print(input_sample_keys) +# ['Input-Token', 'Input-Segment'] +``` + +If we have only one input key, aka we are not using BERT like embedding, + we need to pass json in this format to predict endpoint. 
+ +```json +{ + "instances": [ + [2, 1, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [2, 9, 41, 459, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0] + ] +} +``` + +Here is the code. + +```python +import requests +import numpy as np +from kashgari.processors import load_processors_from_model + +text_processor, label_processor = load_processors_from_model('/Users/brikerman/Desktop/tf-serving/1603683152') + +samples = [ + ['hello', 'world'], + ['你', '好', '世', '界'] +] +tensor = text_processor.transform(samples) + +instances = [i.tolist() for i in tensor] + +# predict +r = requests.post("http://localhost:8501/v1/models/bgru:predict", json={"instances": instances}) +predictions = r.json()['predictions'] + +# Convert result back to labels +labels = label_processor.inverse_transform(np.array(predictions).argmax(-1)) +print(labels) +``` + +If we are using BERT, then we need to handle multiple input fields, + for example we get these two keys `['Input-Token', 'Input-Segment']` from the metadata endpoint. + Then we need to pass a list of objects in this format as the `instances` value. + +```json +[ + { + "Input-Token": [2, 1, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + "Input-Segment": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + }, + { + "Input-Token": [2, 9, 41, 459, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0], + "Input-Segment": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + } +] + +``` + +Here is the code. 
+ +```python +import requests +import numpy as np +from kashgari.processors import load_processors_from_model + +text_processor, label_processor = load_processors_from_model('/Users/brikerman/Desktop/tf-serving/1603683152') + +samples = [ + ['hello', 'world'], + ['你', '好', '世', '界'] +] +tensor = text_processor.transform(samples) + +instances = [{ + "Input-Token": i.tolist(), + "Input-Segment": np.zeros(i.shape).tolist() +} for i in tensor] + +# predict +r = requests.post("http://localhost:8501/v1/models/bgru:predict", json={"instances": instances}) +predictions = r.json()['predictions'] + +# Convert result back to labels +labels = label_processor.inverse_transform(np.array(predictions).argmax(-1)) +print(labels) +``` diff --git a/docs/index.rst b/docs/index.rst index 59af1eae..85887819 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -24,6 +24,12 @@ embeddings/bert-embedding.rst embeddings/transformer-embedding.rst +.. toctree:: + :maxdepth: 2 + :caption: Advanced Use Cases + + advance-use/tensorflow-serving.md + .. 
toctree:: :maxdepth: 3 :caption: API diff --git a/kashgari/__version__.py b/kashgari/__version__.py index d4186858..c77c520a 100644 --- a/kashgari/__version__.py +++ b/kashgari/__version__.py @@ -7,4 +7,4 @@ # file: __version__.py.py # time: 2019-05-20 16:32 -__version__ = '2.0.0' +__version__ = '2.0.1' diff --git a/kashgari/corpus.py b/kashgari/corpus.py index 24e63fcf..75dd1c73 100644 --- a/kashgari/corpus.py +++ b/kashgari/corpus.py @@ -176,7 +176,7 @@ def load_data(cls, raise ModuleNotFoundError( "please install jieba, `$ pip install jieba`") x_data = [list(jieba.cut(item)) for item in df['query'].to_list()] - elif 'char': + elif cutter == 'char': x_data = [list(item) for item in df['query'].to_list()] y_data = df['label'].to_list() diff --git a/kashgari/generators.py b/kashgari/generators.py index d390b213..7e036198 100644 --- a/kashgari/generators.py +++ b/kashgari/generators.py @@ -100,6 +100,15 @@ def __iter__(self) -> Iterator: max_position=self.max_position) yield x_tensor, y_tensor batch_x, batch_y = [], [] + if batch_x: + x_tensor = self.text_processor.transform(batch_x, + seq_length=self.seq_length, + max_position=self.max_position, + segment=self.segment) + y_tensor = self.label_processor.transform(batch_y, + seq_length=self.seq_length, + max_position=self.max_position) + yield x_tensor, y_tensor def take(self, batch_count: int = None) -> Any: """ diff --git a/kashgari/processors/__init__.py b/kashgari/processors/__init__.py index 103ff4b4..f860b873 100644 --- a/kashgari/processors/__init__.py +++ b/kashgari/processors/__init__.py @@ -11,5 +11,7 @@ from .class_processor import ClassificationProcessor from .sequence_processor import SequenceProcessor +from .tools import load_processors_from_model + if __name__ == "__main__": pass diff --git a/kashgari/processors/abc_processor.py b/kashgari/processors/abc_processor.py index c8fe8df0..b6c856cc 100644 --- a/kashgari/processors/abc_processor.py +++ b/kashgari/processors/abc_processor.py @@ -42,6 +42,8 @@ 
def __init__(self, **kwargs: Any) -> None: self.token_bos: str = kwargs.get('token_bos', '[CLS]') # type: ignore self.token_eos: str = kwargs.get('token_eos', '[SEP]') # type: ignore + self._sequence_length_from_saved_model: Optional[int] = None + @property def vocab_size(self) -> int: return len(self.vocab2idx) diff --git a/kashgari/processors/sequence_processor.py b/kashgari/processors/sequence_processor.py index 6b1f63b6..020434ee 100644 --- a/kashgari/processors/sequence_processor.py +++ b/kashgari/processors/sequence_processor.py @@ -106,6 +106,11 @@ def transform(self, max_position: int = None, segment: bool = False) -> np.ndarray: seq_length_from = "" + + # An ugly patch for tf-serving use case. + if seq_length is None and self._sequence_length_from_saved_model is not None: + seq_length = self._sequence_length_from_saved_model + if seq_length is None: seq_length_from = "max length of the samples" seq_length = max([len(i) for i in samples]) + 2 diff --git a/kashgari/processors/tools.py b/kashgari/processors/tools.py new file mode 100644 index 00000000..48d8498b --- /dev/null +++ b/kashgari/processors/tools.py @@ -0,0 +1,34 @@ +# encoding: utf-8 + +# author: BrikerMan +# contact: eliyar917@gmail.com +# blog: https://eliyar.biz + +# file: tools.py +# time: 11:24 上午 + +import json +import os +from typing import Tuple + +from kashgari.processors.abc_processor import ABCProcessor +from kashgari.utils.serialize import load_data_object + + +def load_processors_from_model(model_path: str) -> Tuple[ABCProcessor, ABCProcessor]: + with open(os.path.join(model_path, 'model_config.json'), 'r') as f: + model_config = json.loads(f.read()) + text_processor: ABCProcessor = load_data_object(model_config['text_processor']) + label_processor: ABCProcessor = load_data_object(model_config['label_processor']) + + sequence_length_from_saved_model = model_config['config'].get('sequence_length', None) + text_processor._sequence_length_from_saved_model = 
sequence_length_from_saved_model + label_processor._sequence_length_from_saved_model = sequence_length_from_saved_model + + return text_processor, label_processor + + +if __name__ == "__main__": + text_processor, label_processor = load_processors_from_model('/Users/brikerman/Desktop/tf-serving/1603683152') + x = text_processor.transform([list('我想你了')]) + print(x.tolist()) diff --git a/kashgari/tasks/abs_task_model.py b/kashgari/tasks/abs_task_model.py index 567856f4..6da8ea99 100644 --- a/kashgari/tasks/abs_task_model.py +++ b/kashgari/tasks/abs_task_model.py @@ -30,14 +30,13 @@ class ABCTaskModel(ABC): def __init__(self) -> None: - self.embedding: ABCEmbedding + self.tf_model: tf.keras.Model = None + self.embedding: ABCEmbedding = None self.hyper_parameters: Dict[str, Any] self.sequence_length: int self.text_processor: ABCProcessor self.label_processor: ABCProcessor - self.tf_model: tf.keras.Model - def to_dict(self) -> Dict[str, Any]: model_json_str = self.tf_model.to_json() @@ -48,6 +47,7 @@ def to_dict(self) -> Dict[str, Any]: '__module__': self.__class__.__module__, 'config': { 'hyper_parameters': self.hyper_parameters, # type: ignore + 'sequence_length': self.sequence_length # type: ignore }, 'embedding': self.embedding.to_dict(), # type: ignore 'text_processor': self.text_processor.to_dict(), @@ -77,11 +77,6 @@ def default_hyper_parameters(cls) -> Dict[str, Dict[str, Any]]: raise NotImplementedError def save(self, model_path: str) -> str: - """ - Save model - Args: - model_path: - """ pathlib.Path(model_path).mkdir(exist_ok=True, parents=True) model_path = os.path.abspath(model_path) @@ -106,7 +101,6 @@ def load_model(cls, model_path: str) -> Union["ABCLabelingModel", "ABCClassifica tf_model_str = json.dumps(model_config['tf_model']) - print(tf_model_str) model.tf_model = tf.keras.models.model_from_json(tf_model_str, custom_objects=kashgari.custom_objects) @@ -122,9 +116,3 @@ def build_model(self, x_data: Any, y_data: Any) -> None: raise NotImplementedError 
- - -if __name__ == "__main__": - path = '/var/folders/x3/_dg9_drj42l_cc70tsqkpqrw0000gn/T/1590915853.4571211' - m = ABCTaskModel.load_model(path) - m.tf_model.summary() diff --git a/kashgari/tasks/classification/bi_gru_model.py b/kashgari/tasks/classification/bi_gru_model.py index 48692a85..ccfd67a7 100644 --- a/kashgari/tasks/classification/bi_gru_model.py +++ b/kashgari/tasks/classification/bi_gru_model.py @@ -44,7 +44,3 @@ def build_model_arc(self) -> None: tensor = layer(tensor) self.tf_model = keras.Model(embed_model.inputs, tensor) - - -if __name__ == "__main__": - pass diff --git a/kashgari/tasks/classification/bi_lstm_model.py b/kashgari/tasks/classification/bi_lstm_model.py index 4aeaa71c..f0efb82f 100644 --- a/kashgari/tasks/classification/bi_lstm_model.py +++ b/kashgari/tasks/classification/bi_lstm_model.py @@ -46,28 +46,3 @@ def build_model_arc(self) -> None: tensor = layer(tensor) self.tf_model: keras.Model = keras.Model(embed_model.inputs, tensor) - - -if __name__ == "__main__": - import logging - - logging.basicConfig(level='DEBUG') - - from kashgari.embeddings import WordEmbedding - - w2v_path = '/Users/brikerman/Desktop/nlp/language_models/w2v/sgns.weibo.bigram-char' - w2v = WordEmbedding(w2v_path, w2v_kwargs={'limit': 10000}) - - from kashgari.corpus import SMP2018ECDTCorpus - - x, y = SMP2018ECDTCorpus.load_data() - - model = BiLSTM_Model(embedding=w2v) - model.fit(x, y) - - # 或者集成 CorpusGenerator 实现自己的数据迭代器 - # train_gen = CorpusGenerator() - # model.fit_generator(train_gen=train_gen, - # valid_gen=valid_gen, - # batch_size=batch_size, - # epochs=epochs) diff --git a/kashgari/tasks/labeling/abc_model.py b/kashgari/tasks/labeling/abc_model.py index 329b6c2a..1fe84747 100644 --- a/kashgari/tasks/labeling/abc_model.py +++ b/kashgari/tasks/labeling/abc_model.py @@ -265,7 +265,6 @@ def predict(self, else: seq_length = None - print(self.crf_layer) tensor = self.text_processor.transform(x_data, segment=self.embedding.segment, seq_length=seq_length, 
diff --git a/kashgari/utils/__init__.py b/kashgari/utils/__init__.py index e15cc02b..ccacb99d 100644 --- a/kashgari/utils/__init__.py +++ b/kashgari/utils/__init__.py @@ -17,6 +17,7 @@ from .data import unison_shuffled_copies from .multi_label import MultiLabelBinarizer from .serialize import load_data_object +from .model import convert_to_saved_model if TYPE_CHECKING: from kashgari.tasks.labeling import ABCLabelingModel diff --git a/kashgari/utils/model.py b/kashgari/utils/model.py new file mode 100644 index 00000000..cdcde206 --- /dev/null +++ b/kashgari/utils/model.py @@ -0,0 +1,52 @@ +# encoding: utf-8 + +# author: BrikerMan +# contact: eliyar917@gmail.com +# blog: https://eliyar.biz + +# file: model.py +# time: 10:57 上午 + +import json +import os +import pathlib +import time +from typing import Union, Any + +from kashgari.tasks.abs_task_model import ABCTaskModel + + +def convert_to_saved_model(model: ABCTaskModel, + model_path: str, + version: Union[str, int] = None, + signatures: Any = None, + options: Any = None) -> None: + """ + Export model for tensorflow serving + Args: + model: Target model. + model_path: The path to which the SavedModel will be stored. + version: The model version code, default timestamp + signatures: Signatures to save with the SavedModel. Applicable to the + 'tf' format only. Please see the `signatures` argument in + `tf.saved_model.save` for details. + options: Optional `tf.saved_model.SaveOptions` object that specifies + options for saving to SavedModel. 
+ + """ + if not isinstance(model, ABCTaskModel): + raise ValueError("Only supports the classification model and labeling model") + if version is None: + version = round(time.time()) + export_path = os.path.join(model_path, str(version)) + + pathlib.Path(export_path).mkdir(exist_ok=True, parents=True) + model.tf_model.save(export_path, save_format='tf', signatures=signatures, options=options) + + with open(os.path.join(export_path, 'model_config.json'), 'w') as f: + f.write(json.dumps(model.to_dict(), indent=2, ensure_ascii=True)) + f.close() + + +if __name__ == "__main__": + pass diff --git a/requirements.dev.txt b/requirements.dev.txt index ad075ea5..8bc9b794 100644 --- a/requirements.dev.txt +++ b/requirements.dev.txt @@ -4,6 +4,7 @@ flake8-builtins mypy pytest>=5.4.3 pytest-cov +pytest-split coveralls # documents diff --git a/requirements.txt b/requirements.txt index 4ddbdd54..79a0c2d7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,9 @@ numpy>=1.18.1 gensim>=3.8.1 pandas>=1.0.1 tqdm -bert4keras>=0.7.9 +# Limit this version to avoid json serialization issue. +# See https://github.com/bojone/bert4keras/issues/241 +bert4keras>=0.9.1 scikit-learn>=0.21.1 tensorflow>=2.1.0 tensorflow_addons diff --git a/sonar-project.properties b/sonar-project.properties index ff531d67..7d27479f 100644 --- a/sonar-project.properties +++ b/sonar-project.properties @@ -10,3 +10,6 @@ sonar.organization=brikerman-github # Encoding of the source code. 
Default is default system encoding #sonar.sourceEncoding=UTF-8 + +sonar.python.coverage.reportPaths=artifacts/coverage*/coverage*.xml +sonar.python.xunit.reportPath=artifacts/junit*/junit-*.xml diff --git a/tests/test_classification/test_bi_lstm_model.py b/tests/test_classification/test_bi_lstm_model.py index 87b66f66..5a2b8fba 100644 --- a/tests/test_classification/test_bi_lstm_model.py +++ b/tests/test_classification/test_bi_lstm_model.py @@ -8,16 +8,14 @@ # time: 1:57 下午 import os +import tempfile import time import unittest -import tempfile -import numpy as np - -from tests.test_macros import TestMacros from kashgari.corpus import SMP2018ECDTCorpus from kashgari.embeddings import WordEmbedding from kashgari.tasks.classification import BiLSTM_Model +from tests.test_macros import TestMacros class TestBiLSTM_Model(unittest.TestCase): @@ -63,6 +61,15 @@ def test_basic_use(self): # Make sure use sigmoid as activation function assert new_model.tf_model.layers[-1].activation.__name__ == 'softmax' + # TF Serving Test + from kashgari.utils import convert_to_saved_model + convert_to_saved_model(new_model, + os.path.join(model_path, 'serving'), + version=1) + + from kashgari.processors import load_processors_from_model + _ = load_processors_from_model(os.path.join(model_path, 'serving', '1')) + def test_multi_label(self): corpus = TestMacros.jigsaw_mini_corpus model = self.TASK_MODEL_CLASS(sequence_length=20, multi_label=True) @@ -112,6 +119,5 @@ def test_with_word_embedding(self): _ = new_model.predict(valid_x[:20]) - if __name__ == '__main__': unittest.main() diff --git a/tests/test_generator.py b/tests/test_generator.py index 0aa9b29f..1956ba80 100644 --- a/tests/test_generator.py +++ b/tests/test_generator.py @@ -32,17 +32,42 @@ def test_batch_generator(self): text_processor.build_vocab_generator([corpus_gen]) label_processor.build_vocab_generator([corpus_gen]) - batch_dataset1 = BatchDataSet(corpus_gen, - text_processor=text_processor, - 
label_processor=label_processor, - segment=False, - seq_length=None, - max_position=100, - batch_size=12) - - duplicate_len = len(batch_dataset1) - assert len(list(batch_dataset1.take(duplicate_len))) == duplicate_len - assert len(list(batch_dataset1.take(1))) == 1 + batch_dataset = BatchDataSet(corpus_gen, + text_processor=text_processor, + label_processor=label_processor, + segment=False, + seq_length=None, + max_position=100, + batch_size=12) + + duplicate_len = len(batch_dataset) + assert len(list(batch_dataset.take(duplicate_len))) == duplicate_len + assert len(list(batch_dataset.take(1))) == 1 + + def test_huge_batch_size(self): + x, y = [['this', 'is', 'Jack', 'Ma']], [['O', 'O', 'B', 'I']] + + text_processor = SequenceProcessor() + label_processor = SequenceProcessor(build_vocab_from_labels=True, min_count=1) + + corpus_gen = CorpusGenerator(x, y) + + text_processor.build_vocab_generator([corpus_gen]) + label_processor.build_vocab_generator([corpus_gen]) + + batch_dataset = BatchDataSet(corpus_gen, + text_processor=text_processor, + label_processor=label_processor, + segment=False, + seq_length=None, + max_position=100, + batch_size=512) + + for x_b, y_b in batch_dataset.take(1): + print(y_b.shape) + duplicate_len = len(batch_dataset) + assert len(list(batch_dataset.take(duplicate_len))) == duplicate_len + assert len(list(batch_dataset.take(1))) == 1 if __name__ == '__main__': diff --git a/tests/test_tokenizers.py b/tests/test_tokenizers.py index c71b38f9..fee5ffd0 100644 --- a/tests/test_tokenizers.py +++ b/tests/test_tokenizers.py @@ -17,12 +17,12 @@ class TestUtils(unittest.TestCase): def test_jieba_tokenizer(self): - os.system("pip uninstall -y jieba") + os.system("pip3 uninstall -y jieba") with self.assertRaises(ModuleNotFoundError): _ = JiebaTokenizer() - os.system("pip install jieba") + os.system("pip3 install jieba") t = JiebaTokenizer() assert ['你好', '世界', '!', ' ', 'Hello', ' ', 'World'] == t.tokenize('你好世界! Hello World')