
Commit

feat: wikit_ua in config
lukasellinger committed Oct 21, 2024
1 parent 1885952 commit 1635e9f
Showing 3 changed files with 7 additions and 2 deletions.
4 changes: 4 additions & 0 deletions config.py.template
@@ -10,3 +10,7 @@ HF_WRITE_TOKEN = ''
 HF_READ_TOKENS = ['', '', '']
 
 OPEN_AI_TOKEN = ''
+
+# please follow https://foundation.wikimedia.org/wiki/Policy:User-Agent_policy
+# e.g.: evaluating-factuality-word-definitions-bot (<your email>)
+WIKI_USER_AGENT = ''
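
For illustration, a filled-in config.py following the linked User-Agent policy might look like the snippet below; the bot name mirrors the template's own example and the e-mail address is a placeholder, not a value from the repository.

# Hypothetical filled-in value following the Wikimedia User-Agent policy,
# "<descriptive client name> (<contact information>)"; the address is a placeholder.
WIKI_USER_AGENT = 'evaluating-factuality-word-definitions-bot (jane.doe@example.org)'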
3 changes: 2 additions & 1 deletion fetchers/wikipedia.py
@@ -8,6 +8,7 @@
 from requests import Response
 from transformers import RobertaTokenizer
 
+from config import WIKI_USER_AGENT
 from fetchers.wiktionary_parser import WiktionaryParser
 from general_utils.spacy_utils import (split_into_passage_sentences,
                                        split_into_sentences)
@@ -18,7 +19,7 @@
 class Wikipedia:
     """Wrapper for wikipedia api calls."""
 
-    USER_AGENT = 'summaryBot ([email protected])'
+    USER_AGENT = WIKI_USER_AGENT
     BASE_URL = "https://{source_lang}.{site}.org/w/api.php"
 
     def __init__(self, source_lang: str = 'en', user_agent: str = None, use_dataset: str = ''):
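
The diff does not show where USER_AGENT is ultimately consumed. The sketch below is an assumption of how a wrapper like this typically attaches a configured agent to MediaWiki API calls via the requests library; the fetch_extract helper and the extracts query are illustrative, not taken from the repository.

# Minimal sketch, assuming the `requests` library and the MediaWiki
# TextExtracts API; not the repository's actual Wikipedia wrapper.
import requests

from config import WIKI_USER_AGENT


def fetch_extract(title: str, source_lang: str = 'en') -> dict:
    """Fetch a plain-text page extract, identifying the client via the
    User-Agent header as the Wikimedia policy requires."""
    url = f"https://{source_lang}.wikipedia.org/w/api.php"
    params = {
        'action': 'query',
        'prop': 'extracts',
        'explaintext': 1,
        'titles': title,
        'format': 'json',
    }
    response = requests.get(url, params=params,
                            headers={'User-Agent': WIKI_USER_AGENT},
                            timeout=10)
    response.raise_for_status()
    return response.json()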
2 changes: 1 addition & 1 deletion general_utils/utils.py
@@ -344,7 +344,7 @@ def get_openai_prediction(response) -> str:
     return 'UNKNOWN'
 
 
-def split_into_passages(text: str, tokenizer, max_length=256) -> List[str]:
+def split_into_passages(text: str | List[str], tokenizer, max_length=256) -> List[str]:
     """
     Splits text into passages of a specified token length using a tokenizer.
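
The widened `str | List[str]` hint indicates the function now also accepts pre-split text. The sketch below shows one way such a signature could normalize its input; the greedy passage-building logic is an assumption for illustration, not the repository's implementation, and it only presumes the tokenizer exposes a HuggingFace-style `tokenize` method.

# Illustrative sketch of the widened signature, assuming a HuggingFace-style
# tokenizer with a `tokenize` method; not the repository's actual function body.
from typing import List


def split_into_passages(text: str | List[str], tokenizer, max_length=256) -> List[str]:
    """Greedily pack sentences (or a single raw string) into passages of at
    most `max_length` tokens."""
    sentences = [text] if isinstance(text, str) else text
    passages: List[str] = []
    current: List[str] = []
    current_len = 0
    for sentence in sentences:
        n_tokens = len(tokenizer.tokenize(sentence))
        if current and current_len + n_tokens > max_length:
            passages.append(' '.join(current))
            current, current_len = [], 0
        current.append(sentence)
        current_len += n_tokens
    if current:
        passages.append(' '.join(current))
    return passages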
