diff --git a/README.md b/README.md
index e72be44..2a93243 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
-# ARElight 0.23.2
+# ARElight 0.24.0

-![](https://img.shields.io/badge/Python-3.6.9-brightgreen.svg)
+![](https://img.shields.io/badge/Python-3.9-brightgreen.svg)
 ![](https://img.shields.io/badge/AREkit-0.23.1-orange.svg)

 ### :point_right: [DEMO](https://github.com/nicolay-r/ARElight/tree/v0.22.0#installation) :point_left:
@@ -21,7 +21,7 @@ we adopt [DeepPavlov](https://github.com/deeppavlovteam/DeepPavlov) (BertOntoNo

 1. Main library installation
 ```bash
-pip install git+https://github.com/nicolay-r/arelight@v0.23.1
+pip install git+https://github.com/nicolay-r/arelight@v0.24.0
 ```

 2. (Optional) BRAT: [Download](https://github.com/nlplab/brat/releases/tag/v1.3_Crunchy_Frog)
diff --git a/arelight/brat_backend.py b/arelight/brat_backend.py
index 3bffb75..72db055 100644
--- a/arelight/brat_backend.py
+++ b/arelight/brat_backend.py
@@ -1,3 +1,4 @@
+from arekit.common.docs.entity import DocumentEntity
 from arekit.contrib.networks.input.const import FrameVariantIndices
 from arekit.contrib.networks.input.rows_parser import ParsedSampleRow
 from arekit.contrib.utils.data.readers.csv_pd import PandasCsvReader
@@ -11,7 +12,6 @@ from arekit.common.data.storages.base import BaseRowsStorage
 from arekit.common.entities.base import Entity
 from arekit.common.frames.variants.base import FrameVariant
-from arekit.common.news.entity import DocumentEntity
 from arekit.contrib.utils.processing.text.tokens import Tokens
@@ -114,7 +114,7 @@ def __iter_sample_labels(samples, label_to_rel):
     assert(isinstance(samples, BaseRowsStorage))

     for row_ind, row in samples:
-        str_label = str(row[const.LABEL]) if const.LABEL in row else None
+        str_label = str(row[const.LABEL_UINT]) if const.LABEL_UINT in row else None
         label = label_to_rel[str_label] if str_label in label_to_rel else None
         yield row_ind, label
diff --git a/arelight/doc_ops.py b/arelight/doc_ops.py
index 25d1968..d84a94f 100644
--- a/arelight/doc_ops.py
+++ b/arelight/doc_ops.py
@@ -1,7 +1,7 @@
-from arekit.common.experiment.api.ops_doc import DocumentOperations
+from arekit.common.data.doc_provider import DocumentProvider


-class InMemoryDocOperations(DocumentOperations):
+class InMemoryDocOperations(DocumentProvider):

     def __init__(self, docs=None):
         assert(isinstance(docs, list) or docs is None)
diff --git a/arelight/pipelines/items/entities_default.py b/arelight/pipelines/items/entities_default.py
index 1490506..0e42b11 100644
--- a/arelight/pipelines/items/entities_default.py
+++ b/arelight/pipelines/items/entities_default.py
@@ -13,7 +13,7 @@ def __process_word(word):

         # If this is a special word which is related to the [entity] mention.
         if word[0] == "[" and word[-1] == "]":
-            entity = Entity(value=word[1:-1], e_type=None)
+            entity = Entity(value=word[1:-1], e_type="UNDEFINED")
             return entity

         return word
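The `entities_default.py` change above gives bracketed tokens a concrete placeholder entity type, `"UNDEFINED"`, instead of `None`. A minimal sketch of the resulting contract, assuming only the `Entity` constructor shown in the patch; the module-level `process_word` helper here is hypothetical and mirrors the private `__process_word`:

```python
# A minimal sketch, assuming only the Entity constructor used in the patch.
# `process_word` is a hypothetical stand-in for the private __process_word.
from arekit.common.entities.base import Entity

def process_word(word):
    # Tokens wrapped as "[value]" denote entity mentions.
    if word[0] == "[" and word[-1] == "]":
        # e_type now carries a concrete placeholder instead of None.
        return Entity(value=word[1:-1], e_type="UNDEFINED")
    return word

assert isinstance(process_word("[USA]"), Entity)
assert process_word("oil") == "oil"
```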
diff --git a/arelight/pipelines/items/entities_ner_dp.py b/arelight/pipelines/items/entities_ner_dp.py
index ec818fd..b9f72b1 100644
--- a/arelight/pipelines/items/entities_ner_dp.py
+++ b/arelight/pipelines/items/entities_ner_dp.py
@@ -1,6 +1,6 @@
 from arekit.common.bound import Bound
+from arekit.common.docs.objects_parser import SentenceObjectsParserPipelineItem
 from arekit.common.entities.base import Entity
-from arekit.common.news.objects_parser import SentenceObjectsParserPipelineItem
 from arekit.common.text.partitioning.terms import TermsPartitioning

 from arelight.ner.deep_pavlov import DeepPavlovNER
diff --git a/arelight/pipelines/items/train_bert.py b/arelight/pipelines/items/train_bert.py
index 9c60014..d7b1348 100644
--- a/arelight/pipelines/items/train_bert.py
+++ b/arelight/pipelines/items/train_bert.py
@@ -47,19 +47,19 @@ def apply_core(self, input_data, pipeline_ctx):
     def __iter_batches(s, batch_size):
         assert(isinstance(s, BaseRowsStorage))

-        data = {"text_a": [], "text_b": [], "label": []}
+        data = {"text_a": [], "text_b": [], "label_uint": []}

         # NOTE: it is important to iter shuffled data!
         for row_ind, row in s.iter_shuffled():
             data["text_a"].append(row['text_a'])
             data["text_b"].append(row['text_b'])
-            data["label"].append(row[const.LABEL])
+            data["label_uint"].append(row[const.LABEL_UINT])

         for i in range(0, len(data["text_a"]), batch_size):
             texts_a = data["text_a"][i:i + batch_size]
             texts_b = data["text_b"][i:i + batch_size]
-            labels = data["label"][i:i + batch_size]
+            labels = data["label_uint"][i:i + batch_size]

             batch_features = self.__proc(texts_a=texts_a, texts_b=texts_b)
diff --git a/arelight/pipelines/items/utils.py b/arelight/pipelines/items/utils.py
index a150e2e..6716b52 100644
--- a/arelight/pipelines/items/utils.py
+++ b/arelight/pipelines/items/utils.py
@@ -1,5 +1,5 @@
-from arekit.common.news.base import News
-from arekit.common.news.sentence import BaseNewsSentence
+from arekit.common.docs.base import Document
+from arekit.common.docs.sentence import BaseDocumentSentence


 def input_to_docs(input_data, sentence_parser):
@@ -15,8 +15,8 @@ def input_to_docs(input_data, sentence_parser):
     for doc_id, contents in enumerate(input_data):
         # setup input data.
         sentences = sentence_parser(contents)
-        sentences = list(map(lambda text: BaseNewsSentence(text), sentences))
+        sentences = list(map(lambda text: BaseDocumentSentence(text), sentences))
         # Documents.
-        docs.append(News(doc_id=doc_id, sentences=sentences))
+        docs.append(Document(doc_id=doc_id, sentences=sentences))

     return docs
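The `utils.py` hunk reflects the broader AREkit 0.24.0 rename of `arekit.common.news.*` to `arekit.common.docs.*`: `News` becomes `Document` and `BaseNewsSentence` becomes `BaseDocumentSentence`. A sketch of the migrated construction, with a naive period-based splitter standing in for the sentence parser ARElight actually plugs in:

```python
# Sketch of the renamed 0.24.0 API shown in the diff above; the period-based
# splitter is only a stand-in for the real sentence parser.
from arekit.common.docs.base import Document
from arekit.common.docs.sentence import BaseDocumentSentence

def make_doc(doc_id, text):
    # Wrap each raw sentence into the renamed sentence type.
    sentences = [BaseDocumentSentence(s.strip())
                 for s in text.split(".") if s.strip()]
    return Document(doc_id=doc_id, sentences=sentences)

doc = make_doc(0, "First sentence. Second sentence.")
```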
diff --git a/arelight/run/infer.py b/arelight/run/infer.py
index cdef5fc..f736f55 100644
--- a/arelight/run/infer.py
+++ b/arelight/run/infer.py
@@ -1,9 +1,9 @@
 import argparse
 from os.path import join, dirname, basename

+from arekit.common.docs.entities_grouping import EntitiesGroupingPipelineItem
 from arekit.common.experiment.data_type import DataType
 from arekit.common.folding.nofold import NoFolding
-from arekit.common.news.entities_grouping import EntitiesGroupingPipelineItem
 from arekit.common.synonyms.grouping import SynonymsCollectionValuesGroupingProviders
 from arekit.common.text.parser import BaseTextParser
 from arekit.contrib.utils.pipelines.items.text.terms_splitter import TermsSplitterParser
@@ -87,13 +87,11 @@
         BratHtmlEmbeddingPipelineItem(brat_url="http://localhost:8001/")
     )

-    no_folding = NoFolding(doc_ids=list(range(len(actual_content))),
-                           supported_data_type=DataType.Test)
-
     pipeline.run(None, {
        "template_filepath": join(const.DATA_DIR, "brat_template.html"),
        "predict_fp": "{}.tsv.gz".format(backend_template) if backend_template is not None else None,
        "brat_vis_fp": "{}.html".format(backend_template) if backend_template is not None else None,
        "data_type_pipelines": {DataType.Test: data_pipeline},
-       "data_folding": no_folding
+       "data_folding": NoFolding(),
+       "doc_ids": {DataType.Test: list(range(len(actual_content)))},
     })
diff --git a/arelight/run/serialize.py b/arelight/run/serialize.py
index d188e3e..1cbbfa1 100644
--- a/arelight/run/serialize.py
+++ b/arelight/run/serialize.py
@@ -1,13 +1,13 @@
 import argparse
 from os.path import join, dirname, basename

+from arekit.common.docs.entities_grouping import EntitiesGroupingPipelineItem
 from arekit.common.experiment.data_type import DataType
 from arekit.common.folding.nofold import NoFolding
 from arekit.common.labels.base import NoLabel
 from arekit.common.labels.provider.constant import ConstantLabelProvider
 from arekit.common.labels.scaler.single import SingleLabelScaler
 from arekit.common.labels.str_fmt import StringLabelsFormatter
-from arekit.common.news.entities_grouping import EntitiesGroupingPipelineItem
 from arekit.common.opinions.annot.algo.pair_based import PairBasedOpinionAnnotationAlgorithm
 from arekit.common.opinions.annot.base import BaseOpinionAnnotator
 from arekit.common.pipeline.base import BasePipeline
@@ -102,14 +102,12 @@
         samples_io=SamplesIO(target_dir=dirname(backend_template),
                              prefix=basename(backend_template),
                              writer=NativeCsvWriter(delimiter=',')),
-        save_labels_func=lambda data_type: data_type != DataType.Test,
-        balance_func=lambda data_type: data_type == DataType.Train)
+        save_labels_func=lambda data_type: data_type != DataType.Test)
     ])

-    no_folding = NoFolding(doc_ids=list(range(len(texts_from_files))), supported_data_type=DataType.Test)
-
     pipeline.run(input_data=None, params_dict={
-        "data_folding": no_folding,
+        "data_folding": NoFolding(),
+        "doc_ids": {DataType.Test: list(range(len(texts_from_files)))},
         "data_type_pipelines": {DataType.Test: test_pipeline}
     })
diff --git a/dependencies.txt b/dependencies.txt
index d7cb946..c6b90da 100644
--- a/dependencies.txt
+++ b/dependencies.txt
@@ -1,6 +1,6 @@
-deeppavlov==0.11.0
-# DeepPavlov bert-dp dependencies:
-bert_dp @ git+https://github.com/deepmipt/bert.git@feat/multi_gpu
-# Install arekit
-arekit @ git+https://github.com/nicolay-r/AREkit@0.23.1-rc
-nltk
+deeppavlov==1.2.0
+transformers==4.31.0
+torch==2.0.1
+pytorch-crf==0.7.2
+arekit @ git+https://github.com/nicolay-r/AREkit@0.24.0-rc
+nltk==3.4.5
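The two run scripts above (`infer.py`, `serialize.py`) switch to the same launch contract: `NoFolding()` is now parameterless, and the document ids it used to carry travel separately under a `doc_ids` key, grouped by `DataType`. A sketch of just that parameter dictionary, with `texts` as a placeholder for the actual input collection and the pipeline itself omitted:

```python
# Sketch of the 0.24.0 launch parameters used by infer.py and serialize.py;
# `texts` is a placeholder for the real input collection.
from arekit.common.experiment.data_type import DataType
from arekit.common.folding.nofold import NoFolding

texts = ["..."]
params_dict = {
    "data_folding": NoFolding(),  # no doc_ids / supported_data_type arguments anymore
    "doc_ids": {DataType.Test: list(range(len(texts)))},
    # "data_type_pipelines": {DataType.Test: ...} is passed exactly as in the scripts above
}
```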
diff --git a/setup.py b/setup.py
index 1189807..49bc1a4 100644
--- a/setup.py
+++ b/setup.py
@@ -14,7 +14,7 @@ def get_requirements(filenames):

 setup(
     name='arelight',
-    version='0.23.2',
+    version='0.24.0',
     description='About Mass-media text processing application for your '
                 'Relation Extraction task, powered by AREkit.',
     url='https://github.com/nicolay-r/ARElight',
diff --git a/test/test_bert_serialization.py b/test/test_bert_serialization.py
index 5ba111c..904edf4 100644
--- a/test/test_bert_serialization.py
+++ b/test/test_bert_serialization.py
@@ -1,5 +1,8 @@
 import unittest
 import ru_sent_tokenize
+from arekit.common.docs.base import Document
+from arekit.common.docs.entities_grouping import EntitiesGroupingPipelineItem
+from arekit.common.docs.sentence import BaseDocumentSentence
 from ru_sent_tokenize import ru_sent_tokenize
 from os.path import dirname, join, realpath
@@ -7,9 +10,6 @@
 from arekit.common.folding.nofold import NoFolding
 from arekit.common.labels.base import NoLabel
 from arekit.common.labels.scaler.single import SingleLabelScaler
-from arekit.common.news.base import News
-from arekit.common.news.entities_grouping import EntitiesGroupingPipelineItem
-from arekit.common.news.sentence import BaseNewsSentence
 from arekit.common.pipeline.base import BasePipeline
 from arekit.common.synonyms.grouping import SynonymsCollectionValuesGroupingProviders
 from arekit.common.text.parser import BaseTextParser
@@ -25,6 +25,7 @@
 from arelight.doc_ops import InMemoryDocOperations
 from arelight.pipelines.data.annot_pairs_nolabel import create_neutral_annotation_pipeline
+from arelight.pipelines.items.entities_default import TextEntitiesParser
 from arelight.pipelines.items.entities_ner_dp import DeepPavlovNERPipelineItem
 from arelight.samplers.bert import create_bert_sample_provider
 from arelight.samplers.types import BertSampleProviderTypes
@@ -54,8 +55,8 @@
     def input_to_docs(texts):
         docs = []
         for doc_id, contents in enumerate(texts):
             sentences = ru_sent_tokenize(contents)
-            sentences = list(map(lambda text: BaseNewsSentence(text), sentences))
-            doc = News(doc_id=doc_id, sentences=sentences)
+            sentences = list(map(lambda text: BaseDocumentSentence(text), sentences))
+            doc = Document(doc_id=doc_id, sentences=sentences)
             docs.append(doc)
         return docs
@@ -70,10 +71,10 @@ def test(self):

         # Declare input texts.
         texts = [
             # Text 1.
-            """24 марта президент США Джо Байден провел переговоры с
-            лидерами стран Евросоюза в Брюсселе, вызвав внимание рынка и предположения о
-            том, что Америке удалось уговорить ЕС совместно бойкотировать российские нефть
-            и газ. Европейский Союз крайне зависим от России в плане поставок нефти и
+            """24 марта президент [США] [Джо Байден] провел переговоры с
+            лидерами стран [Евросоюза] в [Брюсселе], вызвав внимание рынка и предположения о
+            том, что [Америке] удалось уговорить [ЕС] совместно бойкотировать российские нефть
+            и газ. [Европейский Союз] крайне зависим от [России] в плане поставок нефти и
             газа."""
         ]
@@ -88,7 +89,7 @@
         # Declare text parser.
         text_parser = BaseTextParser(pipeline=[
             TermsSplitterParser(),
-            DeepPavlovNERPipelineItem(lambda s_obj: s_obj.ObjectType in ["ORG", "PERSON", "LOC", "GPE"]),
+            TextEntitiesParser(),
             EntitiesGroupingPipelineItem(lambda value: SynonymsCollectionValuesGroupingProviders.provide_existed_or_register_missed_value(
                 synonyms=synonyms, value=value))
         ])
@@ -97,9 +98,6 @@
         # Single label scaler.
         single_label_scaler = SingleLabelScaler(NoLabel())

-        # Declare folding and experiment context.
-        no_folding = NoFolding(doc_ids=list(range(len(texts))), supported_data_type=DataType.Test)
-
         # Composing labels formatter and experiment preparation.
         doc_ops = InMemoryDocOperations(docs=BertTestSerialization.input_to_docs(texts))
@@ -113,8 +111,7 @@
             rows_provider=rows_provider,
             storage=RowCacheStorage(),
             samples_io=SamplesIO(target_dir=self.TEST_DATA_DIR, writer=NativeCsvWriter(delimiter=',')),
-            save_labels_func=lambda data_type: data_type != DataType.Test,
-            balance_func=lambda data_type: data_type == DataType.Train)
+            save_labels_func=lambda data_type: data_type != DataType.Test)
         ])

         synonyms = StemmerBasedSynonymCollection(iter_group_values_lists=[],
@@ -131,7 +128,8 @@
         pipeline.run(input_data=None,
                      params_dict={
-                         "data_folding": no_folding,
+                         "data_folding": NoFolding(),
+                         "doc_ids": {DataType.Test: list(range(len(texts)))},
                          "data_type_pipelines": {DataType.Test: test_pipeline}
                      })
diff --git a/update_arekit.sh b/update_arekit.sh
index a792842..c8730fb 100755
--- a/update_arekit.sh
+++ b/update_arekit.sh
@@ -1,2 +1,2 @@
 pip3 uninstall arekit
-pip3 install git+https://github.com/nicolay-r/AREkit@0.23.1-rc --no-deps
+pip3 install git+https://github.com/nicolay-r/AREkit@0.24.0-rc --no-deps
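Finally, the `LABEL` to `LABEL_UINT` column rename seen earlier in `brat_backend.py` and `train_bert.py` follows the renamed sample-storage schema. A sketch of the lookup pattern, assuming `const` resolves to `arekit.common.data.const` as elsewhere in ARElight and using plain dicts in place of `BaseRowsStorage` rows:

```python
# Sketch of the renamed label column; plain dicts stand in for storage rows.
# Assumes const is arekit.common.data.const, as used elsewhere in ARElight.
from arekit.common.data import const

rows = [
    {"text_a": "...", "text_b": "...", const.LABEL_UINT: 1},
    {"text_a": "...", "text_b": "..."},  # unlabeled row
]

# Mirrors __iter_sample_labels: missing labels yield None.
labels = [row[const.LABEL_UINT] if const.LABEL_UINT in row else None
          for row in rows]
assert labels == [1, None]
```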