#76 updated AREkit, updated DeepPavlov default library version
nicolay-r committed Aug 16, 2023
1 parent c488325 commit 889c561
Showing 13 changed files with 45 additions and 51 deletions.
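At a glance, most hunks below track the AREkit 0.23 to 0.24 rename of the `arekit.common.news` namespace to `arekit.common.docs`. A minimal import sketch of the new surface, assembled from the paths that appear in this diff (illustrative only, not a complete program):

```python
# New AREkit 0.24 paths used by this commit (old 0.23 paths in comments).
from arekit.common.docs.base import Document                  # was arekit.common.news.base.News
from arekit.common.docs.sentence import BaseDocumentSentence  # was arekit.common.news.sentence.BaseNewsSentence
from arekit.common.docs.entity import DocumentEntity          # was arekit.common.news.entity.DocumentEntity
from arekit.common.docs.entities_grouping import EntitiesGroupingPipelineItem
from arekit.common.docs.objects_parser import SentenceObjectsParserPipelineItem
from arekit.common.data.doc_provider import DocumentProvider  # was arekit.common.experiment.api.ops_doc.DocumentOperations
from arekit.common.folding.nofold import NoFolding            # now constructed with no arguments
```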
README.md (6 changes: 3 additions & 3 deletions)
@@ -1,6 +1,6 @@
-# ARElight 0.23.2
+# ARElight 0.24.0

-![](https://img.shields.io/badge/Python-3.6.9-brightgreen.svg)
+![](https://img.shields.io/badge/Python-3.9-brightgreen.svg)
![](https://img.shields.io/badge/AREkit-0.23.1-orange.svg)

### :point_right: [DEMO](https://github.com/nicolay-r/ARElight/tree/v0.22.0#installation) :point_left:
@@ -21,7 +21,7 @@ we adopt [DeepPavlov](https://github.com/deeppavlovteam/DeepPavlov) (BertOntoNo

1. Main library installation
```bash
-pip install git+https://github.com/nicolay-r/arelight@v0.23.1
+pip install git+https://github.com/nicolay-r/arelight@v0.24.0
```

2. (Optional) BRAT: [Download](https://github.com/nlplab/brat/releases/tag/v1.3_Crunchy_Frog)
arelight/brat_backend.py (4 changes: 2 additions & 2 deletions)
@@ -1,3 +1,4 @@
+from arekit.common.docs.entity import DocumentEntity
from arekit.contrib.networks.input.const import FrameVariantIndices
from arekit.contrib.networks.input.rows_parser import ParsedSampleRow
from arekit.contrib.utils.data.readers.csv_pd import PandasCsvReader
@@ -11,7 +12,6 @@
from arekit.common.data.storages.base import BaseRowsStorage
from arekit.common.entities.base import Entity
from arekit.common.frames.variants.base import FrameVariant
-from arekit.common.news.entity import DocumentEntity
from arekit.contrib.utils.processing.text.tokens import Tokens


@@ -114,7 +114,7 @@ def __iter_sample_labels(samples, label_to_rel):
assert(isinstance(samples, BaseRowsStorage))

for row_ind, row in samples:
-str_label = str(row[const.LABEL]) if const.LABEL in row else None
+str_label = str(row[const.LABEL_UINT]) if const.LABEL_UINT in row else None
label = label_to_rel[str_label] if str_label in label_to_rel else None
yield row_ind, label

arelight/doc_ops.py (4 changes: 2 additions & 2 deletions)
@@ -1,7 +1,7 @@
-from arekit.common.experiment.api.ops_doc import DocumentOperations
+from arekit.common.data.doc_provider import DocumentProvider


-class InMemoryDocOperations(DocumentOperations):
+class InMemoryDocOperations(DocumentProvider):

def __init__(self, docs=None):
assert(isinstance(docs, list) or docs is None)
arelight/pipelines/items/entities_default.py (2 changes: 1 addition & 1 deletion)
@@ -13,7 +13,7 @@ def __process_word(word):

# If this is a special word which is related to the [entity] mention.
if word[0] == "[" and word[-1] == "]":
-entity = Entity(value=word[1:-1], e_type=None)
+entity = Entity(value=word[1:-1], e_type="UNDEFINED")
return entity

return word
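For illustration, the bracketed-token convention handled by this item can be sketched as a standalone function (a hypothetical rewrite for clarity, using the same `Entity` constructor as the hunk above):

```python
from arekit.common.entities.base import Entity

def process_word(word):
    # Hypothetical standalone version of __process_word above: tokens wrapped
    # in square brackets, e.g. "[USA]", mark entity mentions; after this commit
    # they receive the explicit type "UNDEFINED" instead of None.
    if word[0] == "[" and word[-1] == "]":
        return Entity(value=word[1:-1], e_type="UNDEFINED")
    return word
```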
arelight/pipelines/items/entities_ner_dp.py (2 changes: 1 addition & 1 deletion)
@@ -1,6 +1,6 @@
from arekit.common.bound import Bound
+from arekit.common.docs.objects_parser import SentenceObjectsParserPipelineItem
from arekit.common.entities.base import Entity
-from arekit.common.news.objects_parser import SentenceObjectsParserPipelineItem
from arekit.common.text.partitioning.terms import TermsPartitioning

from arelight.ner.deep_pavlov import DeepPavlovNER
arelight/pipelines/items/train_bert.py (6 changes: 3 additions & 3 deletions)
@@ -47,19 +47,19 @@ def apply_core(self, input_data, pipeline_ctx):
def __iter_batches(s, batch_size):
assert(isinstance(s, BaseRowsStorage))

data = {"text_a": [], "text_b": [], "label": []}
data = {"text_a": [], "text_b": [], "label_uint": []}

# NOTE: it is important to iter shuffled data!
for row_ind, row in s.iter_shuffled():
data["text_a"].append(row['text_a'])
data["text_b"].append(row['text_b'])
data["label"].append(row[const.LABEL])
data["label_uint"].append(row[const.LABEL_UINT])

for i in range(0, len(data["text_a"]), batch_size):

texts_a = data["text_a"][i:i + batch_size]
texts_b = data["text_b"][i:i + batch_size]
labels = data["label"][i:i + batch_size]
labels = data["label_uint"][i:i + batch_size]

batch_features = self.__proc(texts_a=texts_a, texts_b=texts_b)

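As a quick sanity check of the slicing in `__iter_batches`: with N rows the loop above yields ceil(N / batch_size) batches, the last of which may be partial. A self-contained illustration of the same stride pattern:

```python
# Toy mirror of the batching loop above: 5 rows with batch_size=2
# produce index windows [0:2], [2:4], [4:6]; the final batch is shorter.
rows = list(range(5))
batch_size = 2
batches = [rows[i:i + batch_size] for i in range(0, len(rows), batch_size)]
assert batches == [[0, 1], [2, 3], [4]]
```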
arelight/pipelines/items/utils.py (8 changes: 4 additions & 4 deletions)
@@ -1,5 +1,5 @@
-from arekit.common.news.base import News
-from arekit.common.news.sentence import BaseNewsSentence
+from arekit.common.docs.base import Document
+from arekit.common.docs.sentence import BaseDocumentSentence


def input_to_docs(input_data, sentence_parser):
@@ -15,8 +15,8 @@ def input_to_docs(input_data, sentence_parser):
for doc_id, contents in enumerate(input_data):
# setup input data.
sentences = sentence_parser(contents)
-sentences = list(map(lambda text: BaseNewsSentence(text), sentences))
+sentences = list(map(lambda text: BaseDocumentSentence(text), sentences))
# Documents.
-docs.append(News(doc_id=doc_id, sentences=sentences))
+docs.append(Document(doc_id=doc_id, sentences=sentences))

return docs
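A minimal usage sketch of the migrated helper above, assuming a deliberately naive sentence splitter (callers in this repo pass a real `sentence_parser`):

```python
# Hypothetical call; input_to_docs and its imports as defined in this file.
docs = input_to_docs(
    input_data=["First sentence. Second sentence."],
    sentence_parser=lambda text: text.split(". "))  # naive splitter, for illustration
# docs[0] is an arekit.common.docs.base.Document holding BaseDocumentSentence items.
```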
arelight/run/infer.py (8 changes: 3 additions & 5 deletions)
@@ -1,9 +1,9 @@
import argparse
from os.path import join, dirname, basename

+from arekit.common.docs.entities_grouping import EntitiesGroupingPipelineItem
from arekit.common.experiment.data_type import DataType
from arekit.common.folding.nofold import NoFolding
-from arekit.common.news.entities_grouping import EntitiesGroupingPipelineItem
from arekit.common.synonyms.grouping import SynonymsCollectionValuesGroupingProviders
from arekit.common.text.parser import BaseTextParser
from arekit.contrib.utils.pipelines.items.text.terms_splitter import TermsSplitterParser
@@ -87,13 +87,11 @@
BratHtmlEmbeddingPipelineItem(brat_url="http://localhost:8001/")
)

-no_folding = NoFolding(doc_ids=list(range(len(actual_content))),
-                       supported_data_type=DataType.Test)
-
pipeline.run(None, {
"template_filepath": join(const.DATA_DIR, "brat_template.html"),
"predict_fp": "{}.tsv.gz".format(backend_template) if backend_template is not None else None,
"brat_vis_fp": "{}.html".format(backend_template) if backend_template is not None else None,
"data_type_pipelines": {DataType.Test: data_pipeline},
"data_folding": no_folding
"data_folding": NoFolding(),
"doc_ids": {DataType.Test: list(range(len(actual_content)))},
})
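The launch contract changed in the same way here as in `serialize.py` and the test below: `NoFolding` no longer carries document ids; they are passed per data type through a separate `doc_ids` parameter. A minimal sketch, assuming `n_docs` input texts:

```python
from arekit.common.experiment.data_type import DataType
from arekit.common.folding.nofold import NoFolding

n_docs = 3  # hypothetical number of input documents
run_params = {
    "data_folding": NoFolding(),                      # 0.24: no constructor arguments
    "doc_ids": {DataType.Test: list(range(n_docs))},  # ids are now passed explicitly
}
# pipeline.run(None, {**run_params, ...}) as in the call above.
```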
arelight/run/serialize.py (10 changes: 4 additions & 6 deletions)
@@ -1,13 +1,13 @@
import argparse
from os.path import join, dirname, basename

+from arekit.common.docs.entities_grouping import EntitiesGroupingPipelineItem
from arekit.common.experiment.data_type import DataType
from arekit.common.folding.nofold import NoFolding
from arekit.common.labels.base import NoLabel
from arekit.common.labels.provider.constant import ConstantLabelProvider
from arekit.common.labels.scaler.single import SingleLabelScaler
from arekit.common.labels.str_fmt import StringLabelsFormatter
-from arekit.common.news.entities_grouping import EntitiesGroupingPipelineItem
from arekit.common.opinions.annot.algo.pair_based import PairBasedOpinionAnnotationAlgorithm
from arekit.common.opinions.annot.base import BaseOpinionAnnotator
from arekit.common.pipeline.base import BasePipeline
@@ -102,14 +102,12 @@
samples_io=SamplesIO(target_dir=dirname(backend_template),
prefix=basename(backend_template),
writer=NativeCsvWriter(delimiter=',')),
-save_labels_func=lambda data_type: data_type != DataType.Test,
-balance_func=lambda data_type: data_type == DataType.Train)
+save_labels_func=lambda data_type: data_type != DataType.Test)
])

-no_folding = NoFolding(doc_ids=list(range(len(texts_from_files))), supported_data_type=DataType.Test)
-
pipeline.run(input_data=None,
params_dict={
"data_folding": no_folding,
"data_folding": NoFolding(),
"doc_ids": {DataType.Test: list(range(len(texts_from_files)))},
"data_type_pipelines": {DataType.Test: test_pipeline}
})
dependencies.txt (12 changes: 6 additions & 6 deletions)
@@ -1,6 +1,6 @@
-deeppavlov==0.11.0
-# DeepPavlov bert-dp dependencies:
-bert_dp @ git+https://github.com/deepmipt/bert.git@feat/multi_gpu
-# Install arekit
-arekit @ git+https://github.com/nicolay-r/AREkit@0.23.1-rc
-nltk
+deeppavlov==1.2.0
+transformers==4.31.0
+torch==2.0.1
+pytorch-crf==0.7.2
+arekit @ git+https://github.com/nicolay-r/AREkit@0.24.0-rc
+nltk==3.4.5
setup.py (2 changes: 1 addition & 1 deletion)
@@ -14,7 +14,7 @@ def get_requirements(filenames):

setup(
name='arelight',
-version='0.23.2',
+version='0.24.0',
description='About Mass-media text processing application for your '
'Relation Extraction task, powered by AREkit.',
url='https://github.com/nicolay-r/ARElight',
test/test_bert_serialization.py (30 changes: 14 additions & 16 deletions)
@@ -1,15 +1,15 @@
import unittest
import ru_sent_tokenize
+from arekit.common.docs.base import Document
+from arekit.common.docs.entities_grouping import EntitiesGroupingPipelineItem
+from arekit.common.docs.sentence import BaseDocumentSentence
from ru_sent_tokenize import ru_sent_tokenize
from os.path import dirname, join, realpath

from arekit.common.experiment.data_type import DataType
from arekit.common.folding.nofold import NoFolding
from arekit.common.labels.base import NoLabel
from arekit.common.labels.scaler.single import SingleLabelScaler
-from arekit.common.news.base import News
-from arekit.common.news.entities_grouping import EntitiesGroupingPipelineItem
-from arekit.common.news.sentence import BaseNewsSentence
from arekit.common.pipeline.base import BasePipeline
from arekit.common.synonyms.grouping import SynonymsCollectionValuesGroupingProviders
from arekit.common.text.parser import BaseTextParser
@@ -25,6 +25,7 @@

from arelight.doc_ops import InMemoryDocOperations
from arelight.pipelines.data.annot_pairs_nolabel import create_neutral_annotation_pipeline
+from arelight.pipelines.items.entities_default import TextEntitiesParser
from arelight.pipelines.items.entities_ner_dp import DeepPavlovNERPipelineItem
from arelight.samplers.bert import create_bert_sample_provider
from arelight.samplers.types import BertSampleProviderTypes
@@ -54,8 +55,8 @@ def input_to_docs(texts):
docs = []
for doc_id, contents in enumerate(texts):
sentences = ru_sent_tokenize(contents)
-sentences = list(map(lambda text: BaseNewsSentence(text), sentences))
-doc = News(doc_id=doc_id, sentences=sentences)
+sentences = list(map(lambda text: BaseDocumentSentence(text), sentences))
+doc = Document(doc_id=doc_id, sentences=sentences)
docs.append(doc)
return docs

@@ -70,10 +71,10 @@ def test(self):
# Declare input texts.
texts = [
# Text 1.
"""24 марта президент США Джо Байден провел переговоры с
лидерами стран Евросоюза в Брюсселе, вызвав внимание рынка и предположения о
том, что Америке удалось уговорить ЕС совместно бойкотировать российские нефть
и газ. Европейский Союз крайне зависим от России в плане поставок нефти и
"""24 марта президент [США] [Джо Байден] провел переговоры с
лидерами стран [Евросоюза] в [Брюсселе], вызвав внимание рынка и предположения о
том, что [Америке] удалось уговорить [ЕС] совместно бойкотировать российские нефть
и газ. [Европейский Союз] крайне зависим от [России] в плане поставок нефти и
газа."""
]

@@ -88,7 +89,7 @@ def test(self):
# Declare text parser.
text_parser = BaseTextParser(pipeline=[
TermsSplitterParser(),
-DeepPavlovNERPipelineItem(lambda s_obj: s_obj.ObjectType in ["ORG", "PERSON", "LOC", "GPE"]),
+TextEntitiesParser(),
EntitiesGroupingPipelineItem(lambda value:
SynonymsCollectionValuesGroupingProviders.provide_existed_or_register_missed_value(
synonyms=synonyms, value=value))
@@ -97,9 +98,6 @@
# Single label scaler.
single_label_scaler = SingleLabelScaler(NoLabel())

-# Declare folding and experiment context.
-no_folding = NoFolding(doc_ids=list(range(len(texts))), supported_data_type=DataType.Test)
-
# Composing labels formatter and experiment preparation.
doc_ops = InMemoryDocOperations(docs=BertTestSerialization.input_to_docs(texts))

@@ -113,8 +111,7 @@
rows_provider=rows_provider,
storage=RowCacheStorage(),
samples_io=SamplesIO(target_dir=self.TEST_DATA_DIR, writer=NativeCsvWriter(delimiter=',')),
-save_labels_func=lambda data_type: data_type != DataType.Test,
-balance_func=lambda data_type: data_type == DataType.Train)
+save_labels_func=lambda data_type: data_type != DataType.Test)
])

synonyms = StemmerBasedSynonymCollection(iter_group_values_lists=[],
@@ -131,7 +128,8 @@

pipeline.run(input_data=None,
params_dict={
"data_folding": no_folding,
"data_folding": NoFolding(),
"doc_ids": {DataType.Test: list(range(len(texts)))},
"data_type_pipelines": {DataType.Test: test_pipeline}
})

update_arekit.sh (2 changes: 1 addition & 1 deletion)
@@ -1,2 +1,2 @@
pip3 uninstall arekit
-pip3 install git+https://github.com/nicolay-r/AREkit@0.23.1-rc --no-deps
+pip3 install git+https://github.com/nicolay-r/AREkit@0.24.0-rc --no-deps
