Skip to content

Commit

Permalink
Merge branch 'v0.25.1'
Browse files Browse the repository at this point in the history
  • Loading branch information
nicolay-r committed Jan 1, 2025
2 parents 46a3ed3 + ae494c6 commit 879b621
Show file tree
Hide file tree
Showing 42 changed files with 344 additions and 931 deletions.
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# ARElight 0.25.0
# ARElight 0.25.1

![](https://img.shields.io/badge/Python-3.9-brightgreen.svg)
![](https://img.shields.io/badge/AREkit-0.25.1-orange.svg)
![](https://img.shields.io/badge/AREkit-0.25.2-orange.svg)
[![](https://img.shields.io/badge/demo-0.24.0-purple.svg)](https://guardeec.github.io/arelight_demo/template.html)
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nicolay-r/ARElight/blob/v0.24.0/ARElight.ipynb)
[![PyPI downloads](https://img.shields.io/pypi/dm/arelight.svg)](https://pypistats.org/packages/arelight)
Expand All @@ -21,7 +21,7 @@ This repository is a part of the **ECIR-2024** demo paper:
# Installation

```bash
pip install git+https://github.com/nicolay-r/[email protected].0
pip install git+https://github.com/nicolay-r/[email protected].1
```

# GUI Interface
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ class IndexedEntity(Entity):
""" Same as the base Entity but supports indexing.
"""

def __init__(self, value, e_type, entity_id, display_value=None):
super(IndexedEntity, self).__init__(value=value, e_type=e_type, display_value=display_value)
def __init__(self, value, e_type, entity_id):
super(IndexedEntity, self).__init__(value=value, e_type=e_type)
self.__id = entity_id

@property
Expand Down
File renamed without changes.
108 changes: 108 additions & 0 deletions arelight/arekit/storages/pandas_based.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
import importlib

import numpy as np

from arekit.common.data.input.providers.columns.base import BaseColumnsProvider
from arekit.common.data.storages.base import BaseRowsStorage, logger
from arekit.common.utils import progress_bar_iter


class PandasBasedRowsStorage(BaseRowsStorage):
    """ Storage Kernel functions implementation,
        based on the pandas DataFrames.
    """

    def __init__(self, df=None, **kwargs):
        super(PandasBasedRowsStorage, self).__init__(**kwargs)
        # Underlying pandas DataFrame; may remain None until
        # `init_empty` or `fill` is invoked.
        self._df = df

    @property
    def DataFrame(self):
        # TODO. Temporary hack, however this should be removed in future.
        return self._df

    @staticmethod
    def __create_empty(cols_with_types):
        """ Create an empty DataFrame with a typed column layout.

            cols_with_types: list of pairs ("name", dtype)
        """
        assert(isinstance(cols_with_types, list))
        # A zero-length structured array gives pandas the column
        # names and dtypes without allocating any rows.
        data = np.empty(0, dtype=np.dtype(cols_with_types))
        # Pandas is imported lazily so the module can be loaded even
        # when pandas is not installed (it is only required here).
        pd = importlib.import_module("pandas")
        return pd.DataFrame(data)

    def __filter(self, column_name, value):
        # Return the subset of rows whose `column_name` equals `value`.
        return self._df[self._df[column_name] == value]

    @staticmethod
    def __iter_rows_core(df):
        for row_index, row in df.iterrows():
            yield row_index, row

    def __fill_with_blank_rows(self, row_id_column_name, rows_count):
        """ Pre-allocate `rows_count` rows, indexed by `row_id_column_name`. """
        assert(isinstance(row_id_column_name, str))
        assert(isinstance(rows_count, int))
        self._df[row_id_column_name] = list(range(rows_count))
        self._df.set_index(row_id_column_name, inplace=True)

    # region protected methods

    def iter_column_names(self):
        return iter(self._df.columns)

    def iter_column_types(self):
        return iter(self._df.dtypes)

    def _set_row_value(self, row_ind, column, value):
        self._df.at[row_ind, column] = value

    def _iter_rows(self):
        for row_index, row in self.__iter_rows_core(self._df):
            yield row_index, row.to_dict()

    def _get_rows_count(self):
        return len(self._df)

    # endregion

    # region public methods

    def fill(self, iter_rows_func, columns_provider, row_handler=None, rows_count=None, desc=""):
        """ NOTE: We provide the rows counting which is required
            in order to know an expected amount of rows in advance
            due to the specifics of the pandas memory allocation
            for the DataFrames.
            The latter allows us avoid rows appending, which
            may significantly affect performance once the size
            of DataFrame becomes relatively large.
        """
        assert(isinstance(columns_provider, BaseColumnsProvider))

        if rows_count is None:
            # Only perform the (potentially expensive) counting pass
            # when the caller did not supply the rows amount; previously
            # a provided `rows_count` was silently ignored and recounted.
            logger.info("Rows calculation process started. [Required by Pandas-Based storage kernel]")
            logged_rows_it = progress_bar_iter(
                iterable=iter_rows_func(True),
                desc="Calculating rows count ({reason})".format(reason=desc),
                unit="rows")
            rows_count = sum(1 for _ in logged_rows_it)

        logger.info("Filling with blank rows: {}".format(rows_count))
        self.__fill_with_blank_rows(row_id_column_name=columns_provider.ROW_ID,
                                    rows_count=rows_count)
        logger.info("Completed!")

        super(PandasBasedRowsStorage, self).fill(iter_rows_func=iter_rows_func,
                                                 row_handler=row_handler,
                                                 columns_provider=columns_provider,
                                                 rows_count=rows_count)

    def get_row(self, row_index):
        # Positional (iloc-based) row access.
        return self._df.iloc[row_index]

    def init_empty(self, columns_provider):
        cols_with_types = columns_provider.get_columns_list_with_types()
        self._df = self.__create_empty(cols_with_types)

    def free(self):
        # Drop the reference to the DataFrame so its memory may be
        # reclaimed; any later access to `self._df` raises AttributeError.
        del self._df
        super(PandasBasedRowsStorage, self).free()

    # endregion
3 changes: 2 additions & 1 deletion arelight/data/writers/csv_pd.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@

from arekit.common.data.input.providers.columns.base import BaseColumnsProvider
from arekit.common.utils import create_dir_if_not_exists
from arekit.contrib.utils.data.storages.pandas_based import PandasBasedRowsStorage
from arekit.contrib.utils.data.writers.base import BaseWriter

from arelight.arekit.storages.pandas_based import PandasBasedRowsStorage

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

Expand Down
58 changes: 0 additions & 58 deletions arelight/ner/base.py

This file was deleted.

17 changes: 0 additions & 17 deletions arelight/ner/deep_pavlov.py

This file was deleted.

21 changes: 0 additions & 21 deletions arelight/ner/obj_desc.py

This file was deleted.

8 changes: 7 additions & 1 deletion arelight/pipelines/data/annot_pairs_nolabel.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,24 @@
from arekit.contrib.utils.pipelines.text_opinion.extraction import text_opinion_extraction_pipeline
from arekit.contrib.utils.pipelines.text_opinion.filters.distance_based import DistanceLimitedTextOpinionFilter

from arelight.arekit.indexed_entity import IndexedEntity


def create_neutral_annotation_pipeline(synonyms, dist_in_terms_bound, terms_per_context, batch_size,
doc_provider, text_pipeline, dist_in_sentences=0):

nolabel_annotator = AlgorithmBasedTextOpinionAnnotator(
is_entity_func=lambda term: isinstance(term, IndexedEntity),
value_to_group_id_func=lambda value:
SynonymsCollectionValuesGroupingProviders.provide_existed_or_register_missed_value(
synonyms=synonyms, value=value),
annot_algo=PairBasedOpinionAnnotationAlgorithm(
dist_in_sents=dist_in_sentences,
dist_in_terms_bound=dist_in_terms_bound,
entity_index_func=lambda indexed_entity: indexed_entity.ID,
label_provider=ConstantLabelProvider(NoLabel())),
label_provider=ConstantLabelProvider(NoLabel()),
is_entity_func=lambda term: isinstance(term, IndexedEntity),
entity_value_func=lambda e: e.Value),
create_empty_collection_func=lambda: OpinionCollection(
opinions=[],
synonyms=synonyms,
Expand All @@ -30,6 +35,7 @@ def create_neutral_annotation_pipeline(synonyms, dist_in_terms_bound, terms_per_
entity_index_func=lambda indexed_entity: indexed_entity.ID,
pipeline_items=text_pipeline,
get_doc_by_id_func=doc_provider.by_id,
is_entity_func=lambda term: isinstance(term, IndexedEntity),
annotators=[
nolabel_annotator
],
Expand Down
97 changes: 0 additions & 97 deletions arelight/pipelines/items/entities_ner_dp.py

This file was deleted.

Loading

0 comments on commit 879b621

Please sign in to comment.