Skip to content

Commit

Permalink
Merge branch 'v0.25.1'
Browse files Browse the repository at this point in the history
  • Loading branch information
nicolay-r committed Jan 1, 2025
2 parents 46a3ed3 + ae494c6 commit 879b621
Show file tree
Hide file tree
Showing 42 changed files with 344 additions and 931 deletions.
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# ARElight 0.25.0
# ARElight 0.25.1

![](https://img.shields.io/badge/Python-3.9-brightgreen.svg)
![](https://img.shields.io/badge/AREkit-0.25.1-orange.svg)
![](https://img.shields.io/badge/AREkit-0.25.2-orange.svg)
[![](https://img.shields.io/badge/demo-0.24.0-purple.svg)](https://guardeec.github.io/arelight_demo/template.html)
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nicolay-r/ARElight/blob/v0.24.0/ARElight.ipynb)
[![PyPI downloads](https://img.shields.io/pypi/dm/arelight.svg)](https://pypistats.org/packages/arelight)
Expand All @@ -21,7 +21,7 @@ This repository is a part of the **ECIR-2024** demo paper:
# Installation

```bash
pip install git+https://github.com/nicolay-r/[email protected].0
pip install git+https://github.com/nicolay-r/[email protected].1
```

# GUI Interface
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ class IndexedEntity(Entity):
""" Same as the base Entity but supports indexing.
"""

def __init__(self, value, e_type, entity_id, display_value=None):
super(IndexedEntity, self).__init__(value=value, e_type=e_type, display_value=display_value)
def __init__(self, value, e_type, entity_id):
super(IndexedEntity, self).__init__(value=value, e_type=e_type)
self.__id = entity_id

@property
Expand Down
File renamed without changes.
108 changes: 108 additions & 0 deletions arelight/arekit/storages/pandas_based.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
import importlib

import numpy as np

from arekit.common.data.input.providers.columns.base import BaseColumnsProvider
from arekit.common.data.storages.base import BaseRowsStorage, logger
from arekit.common.utils import progress_bar_iter


class PandasBasedRowsStorage(BaseRowsStorage):
    """ Storage Kernel functions implementation,
        based on the pandas DataFrames.
    """

    def __init__(self, df=None, **kwargs):
        super(PandasBasedRowsStorage, self).__init__(**kwargs)
        # Underlying pandas DataFrame; may remain None until
        # `init_empty` or `fill` is invoked.
        self._df = df

    @property
    def DataFrame(self):
        # TODO. Temporary hack, however this should be removed in future.
        return self._df

    @staticmethod
    def __create_empty(cols_with_types):
        """ Create an empty DataFrame with a typed column layout.

            cols_with_types: list of pairs ("name", dtype)
        """
        assert(isinstance(cols_with_types, list))
        # A zero-length structured array gives pandas the column
        # names and dtypes without allocating any rows.
        data = np.empty(0, dtype=np.dtype(cols_with_types))
        # Pandas is imported lazily so the module can be loaded even
        # when pandas is not installed (it is only required here).
        pd = importlib.import_module("pandas")
        return pd.DataFrame(data)

    def __filter(self, column_name, value):
        # Return the subset of rows whose `column_name` equals `value`.
        return self._df[self._df[column_name] == value]

    @staticmethod
    def __iter_rows_core(df):
        for row_index, row in df.iterrows():
            yield row_index, row

    def __fill_with_blank_rows(self, row_id_column_name, rows_count):
        """ Pre-allocate `rows_count` rows, indexed by `row_id_column_name`. """
        assert(isinstance(row_id_column_name, str))
        assert(isinstance(rows_count, int))
        self._df[row_id_column_name] = list(range(rows_count))
        self._df.set_index(row_id_column_name, inplace=True)

    # region protected methods

    def iter_column_names(self):
        return iter(self._df.columns)

    def iter_column_types(self):
        return iter(self._df.dtypes)

    def _set_row_value(self, row_ind, column, value):
        self._df.at[row_ind, column] = value

    def _iter_rows(self):
        for row_index, row in self.__iter_rows_core(self._df):
            yield row_index, row.to_dict()

    def _get_rows_count(self):
        return len(self._df)

    # endregion

    # region public methods

    def fill(self, iter_rows_func, columns_provider, row_handler=None, rows_count=None, desc=""):
        """ NOTE: We provide the rows counting which is required
            in order to know an expected amount of rows in advance
            due to the specifics of the pandas memory allocation
            for the DataFrames.
            The latter allows us avoid rows appending, which
            may significantly affect performance once the size
            of DataFrame becomes relatively large.
        """
        assert(isinstance(columns_provider, BaseColumnsProvider))

        if rows_count is None:
            # Only perform the (potentially expensive) counting pass
            # when the caller did not supply the rows amount; previously
            # a provided `rows_count` was silently ignored and recounted.
            logger.info("Rows calculation process started. [Required by Pandas-Based storage kernel]")
            logged_rows_it = progress_bar_iter(
                iterable=iter_rows_func(True),
                desc="Calculating rows count ({reason})".format(reason=desc),
                unit="rows")
            rows_count = sum(1 for _ in logged_rows_it)

        logger.info("Filling with blank rows: {}".format(rows_count))
        self.__fill_with_blank_rows(row_id_column_name=columns_provider.ROW_ID,
                                    rows_count=rows_count)
        logger.info("Completed!")

        super(PandasBasedRowsStorage, self).fill(iter_rows_func=iter_rows_func,
                                                 row_handler=row_handler,
                                                 columns_provider=columns_provider,
                                                 rows_count=rows_count)

    def get_row(self, row_index):
        # Positional (iloc-based) row access.
        return self._df.iloc[row_index]

    def init_empty(self, columns_provider):
        cols_with_types = columns_provider.get_columns_list_with_types()
        self._df = self.__create_empty(cols_with_types)

    def free(self):
        # Drop the reference to the DataFrame so its memory may be
        # reclaimed; any later access to `self._df` raises AttributeError.
        del self._df
        super(PandasBasedRowsStorage, self).free()

    # endregion
3 changes: 2 additions & 1 deletion arelight/data/writers/csv_pd.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@

from arekit.common.data.input.providers.columns.base import BaseColumnsProvider
from arekit.common.utils import create_dir_if_not_exists
from arekit.contrib.utils.data.storages.pandas_based import PandasBasedRowsStorage
from arekit.contrib.utils.data.writers.base import BaseWriter

from arelight.arekit.storages.pandas_based import PandasBasedRowsStorage

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

Expand Down
58 changes: 0 additions & 58 deletions arelight/ner/base.py

This file was deleted.

17 changes: 0 additions & 17 deletions arelight/ner/deep_pavlov.py

This file was deleted.

21 changes: 0 additions & 21 deletions arelight/ner/obj_desc.py

This file was deleted.

8 changes: 7 additions & 1 deletion arelight/pipelines/data/annot_pairs_nolabel.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,24 @@
from arekit.contrib.utils.pipelines.text_opinion.extraction import text_opinion_extraction_pipeline
from arekit.contrib.utils.pipelines.text_opinion.filters.distance_based import DistanceLimitedTextOpinionFilter

from arelight.arekit.indexed_entity import IndexedEntity


def create_neutral_annotation_pipeline(synonyms, dist_in_terms_bound, terms_per_context, batch_size,
doc_provider, text_pipeline, dist_in_sentences=0):

nolabel_annotator = AlgorithmBasedTextOpinionAnnotator(
is_entity_func=lambda term: isinstance(term, IndexedEntity),
value_to_group_id_func=lambda value:
SynonymsCollectionValuesGroupingProviders.provide_existed_or_register_missed_value(
synonyms=synonyms, value=value),
annot_algo=PairBasedOpinionAnnotationAlgorithm(
dist_in_sents=dist_in_sentences,
dist_in_terms_bound=dist_in_terms_bound,
entity_index_func=lambda indexed_entity: indexed_entity.ID,
label_provider=ConstantLabelProvider(NoLabel())),
label_provider=ConstantLabelProvider(NoLabel()),
is_entity_func=lambda term: isinstance(term, IndexedEntity),
entity_value_func=lambda e: e.Value),
create_empty_collection_func=lambda: OpinionCollection(
opinions=[],
synonyms=synonyms,
Expand All @@ -30,6 +35,7 @@ def create_neutral_annotation_pipeline(synonyms, dist_in_terms_bound, terms_per_
entity_index_func=lambda indexed_entity: indexed_entity.ID,
pipeline_items=text_pipeline,
get_doc_by_id_func=doc_provider.by_id,
is_entity_func=lambda term: isinstance(term, IndexedEntity),
annotators=[
nolabel_annotator
],
Expand Down
97 changes: 0 additions & 97 deletions arelight/pipelines/items/entities_ner_dp.py

This file was deleted.

Loading

0 comments on commit 879b621

Please sign in to comment.