Skip to content

Commit

Permalink
#52 done
Browse files (browse the repository at this point in the history)
  • Loading branch information
nicolay-r committed Jan 8, 2023
1 parent c2e58f9 commit e139a53
Show file tree
Hide file tree
Showing 27 changed files with 134 additions and 1,121 deletions.
48 changes: 1 addition & 47 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ we adopt [DeepPavlov](https://github.com/deeppavlovteam/DeepPavlov) (BertOntoNo

# Dependencies

* arekit == 0.22.1
* arekit == 0.23.0
* deeppavlov == 0.11.0
* rusenttokenize
* brat-v1.3 [[github]](https://github.com/nlplab/brat)
Expand Down Expand Up @@ -52,13 +52,6 @@ service apache2 start
![](docs/demo.png)

* PCNN example, finetuned on [RuSentRel](https://github.com/nicolay-r/RuSentRel):
http://172.17.0.2/examples/demo/wui_nn.py

> **Supported Languages**: Russian
![](docs/demo_pcnn.png)

## Full
* ARElight:
```bash
Expand Down Expand Up @@ -96,29 +89,6 @@ python3.6 infer_bert.py --from-files ../data/texts-inosmi-rus/e1.txt \
<img src="docs/inference-bert-e1.png"/>
</p>

> **Supported Languages**: Russian
Using the pretrained `PCNN` model (including frames annotation):
```bash
python3.6 infer_nn.py --from-files ../data/texts-inosmi-rus/e1.txt \
--model-name pcnn \
--model-state-dir models/ \
--terms-per-context 50 \
--stemmer mystem \
--entities-parser bert-ontonotes \
--frames ruattitudes-20 \
--labels-count 3 \
--bags-per-minibatch 2 \
--model-input-type ctx \
--entity-fmt hidden-simple-eng \
--synonyms-filepath ../data/synonyms.txt \
-o output/brat_inference_output
```

<p align="center">
<img src="docs/inference-pcnn-e1.png"/>
</p>

# Serialization

> **Supported Languages**: Any
Expand All @@ -134,22 +104,6 @@ python3.6 serialize_bert.py --from-files ../data/texts-inosmi-rus/e1.txt
<img src="docs/samples-bert.png">
</p>

> **Supported Languages**: Russian by default (depends on embedding)
For the other neural networks (including embedding and other features):
```bash
python3.6 serialize_nn.py --from-files ../data/texts-inosmi-rus/e1.txt \
--entities-parser bert-ontonotes \
--stemmer mystem \
--terms-per-context 50 \
--synonyms-filepath ../data/synonyms.txt \
--frames ruattitudes-20
```

<p align="center">
<img src="docs/samples-nn.png"/>
</p>

# Papers

* [Nicolay Rusnachenko: Language Models Application in Sentiment Attitude Extraction Task (2021) [RUS]](https://nicolay-r.github.io/website/data/rusnachenko2021language.pdf)
Expand Down
15 changes: 10 additions & 5 deletions arelight/brat_backend.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
from arekit.contrib.networks.input.const import FrameVariantIndices
from arekit.contrib.networks.input.rows_parser import ParsedSampleRow
from arekit.contrib.utils.data.readers.csv_pd import PandasCsvReader
from tqdm import tqdm
import collections

Expand All @@ -11,8 +14,6 @@
from arekit.common.frames.variants.base import FrameVariant
from arekit.common.news.entity import DocumentEntity
from arekit.contrib.utils.processing.text.tokens import Tokens
from arekit.contrib.networks.core.input.const import FrameVariantIndices
from arekit.contrib.networks.core.input.rows_parser import ParsedSampleRow


class BratBackend(object):
Expand Down Expand Up @@ -175,6 +176,7 @@ def __to_terms(doc_data):

text_terms[e_ind] = DocumentEntity(
value=sentence_entity_values[i],
display_value=sentence_entity_values[i],
e_type=sentence_entity_types[i],
id_in_doc=e_doc_id,
group_index=None)
Expand All @@ -184,7 +186,7 @@ def __to_terms(doc_data):
if sent_data[FrameVariantIndices] is not None:
for i, f_ind in enumerate(sent_data[FrameVariantIndices]):
value = text_terms[f_ind]
text_terms[f_ind] = FrameVariant(text=value, frame_id="0")
text_terms[f_ind] = FrameVariant(terms=[value], frame_id="0")

for i, term in enumerate(text_terms):
if not isinstance(term, str):
Expand Down Expand Up @@ -346,9 +348,12 @@ def to_data(self, obj_color_types, rel_color_types, samples_data_filepath,
assert(isinstance(docs_range, tuple) or docs_range is None)
assert(isinstance(label_to_rel, dict))

samples_reader = PandasCsvReader(col_types={'frames': str})
result_reader = PandasCsvReader()

text, coll_data, doc_data = self.__to_data(
samples=BaseRowsStorage.from_tsv(samples_data_filepath, col_types={'frames': str}),
result=BaseRowsStorage.from_tsv(result_data_filepath) if result_data_filepath is not None else None,
samples=samples_reader.read(samples_data_filepath),
result=result_reader.read(result_data_filepath) if result_data_filepath is not None else None,
obj_color_types=obj_color_types,
rel_color_types=rel_color_types,
label_to_rel=label_to_rel,
Expand Down
Empty file removed arelight/network/__init__.py
Empty file.
Empty file removed arelight/network/nn/__init__.py
Empty file.
42 changes: 0 additions & 42 deletions arelight/network/nn/common.py

This file was deleted.

10 changes: 6 additions & 4 deletions arelight/pipelines/demo/infer_bert_rus.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
from arekit.common.data.input.readers.tsv import TsvReader
from arekit.common.data.input.writers.tsv import TsvWriter
from arekit.common.experiment.data_type import DataType
from arekit.common.labels.scaler.base import BaseLabelScaler
from arekit.common.pipeline.base import BasePipeline
from arekit.contrib.networks.core.predict.tsv_writer import TsvPredictWriter
from arekit.contrib.utils.data.readers.csv_pd import PandasCsvReader
from arekit.contrib.utils.data.writers.csv_pd import PandasCsvWriter
from arekit.contrib.utils.io_utils.samples import SamplesIO
from arekit.contrib.utils.pipelines.items.sampling.bert import BertExperimentInputSerializerPipelineItem

from arelight.pipelines.demo.labels.base import PositiveLabel, NegativeLabel
from arelight.pipelines.items.backend_brat_json import BratBackendContentsPipelineItem
from arelight.pipelines.items.inference_bert import BertInferencePipelineItem
from arelight.predict_writer_csv import TsvPredictWriter
from arelight.samplers.bert import create_bert_sample_provider
from arelight.samplers.types import SampleFormattersService

Expand All @@ -28,7 +28,9 @@ def demo_infer_texts_bert_pipeline(texts_count,
assert(isinstance(output_dir, str))
assert(isinstance(labels_scaler, BaseLabelScaler))

samples_io = SamplesIO(target_dir=output_dir, reader=TsvReader(), writer=TsvWriter(write_header=True))
samples_io = SamplesIO(target_dir=output_dir,
reader=PandasCsvReader(),
writer=PandasCsvWriter(write_header=True))

pipeline = BasePipeline(pipeline=[

Expand Down
113 changes: 0 additions & 113 deletions arelight/pipelines/demo/infer_nn_rus.py

This file was deleted.

11 changes: 7 additions & 4 deletions arelight/pipelines/items/inference_bert.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,17 @@

from arekit.common.data import const
from arekit.common.data.input.providers.text.single import BaseSingleTextProvider
from arekit.common.data.storages.base import BaseRowsStorage
from arekit.common.experiment.data_type import DataType
from arekit.common.labels.scaler.base import BaseLabelScaler
from arekit.common.pipeline.context import PipelineContext
from arekit.common.pipeline.items.base import BasePipelineItem
from arekit.contrib.bert.input.providers.text_pair import PairTextProvider
from arekit.contrib.networks.core.predict.base_writer import BasePredictWriter
from arekit.contrib.networks.core.predict.provider import BasePredictProvider
from arekit.contrib.utils.data.readers.csv_pd import PandasCsvReader
from arekit.contrib.utils.io_utils.samples import SamplesIO

from arelight.predict_provider import BasePredictProvider
from arelight.predict_writer import BasePredictWriter

from deeppavlov.models.bert import bert_classifier
from deeppavlov.models.preprocessors.bert_preprocessor import BertPreprocessor

Expand Down Expand Up @@ -46,12 +48,13 @@ def __init__(self, bert_config_file, model_checkpoint_path, vocab_filepath, samp
self.__predict_provider = BasePredictProvider()
self.__samples_io = samples_io
self.__batch_size = batch_size
self.__samples_reader = PandasCsvReader()

def apply_core(self, input_data, pipeline_ctx):
assert(isinstance(pipeline_ctx, PipelineContext))

def __iter_predict_result():
samples = BaseRowsStorage.from_tsv(samples_filepath)
samples = self.__samples_reader.read(samples_filepath)

used_row_ids = set()

Expand Down
Loading

0 comments on commit e139a53

Please sign in to comment.