diff --git a/docs/.buildinfo b/docs/.buildinfo index 756f522..616a937 100644 --- a/docs/.buildinfo +++ b/docs/.buildinfo @@ -1,4 +1,4 @@ # Sphinx build info version 1 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. -config: 1b2c69900a20a5e6d404547532dec06c +config: 76d7be4743f134d1ecb28043ff0666d5 tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/docs/_modules/index.html b/docs/_modules/index.html index 76d757a..91b8010 100644 --- a/docs/_modules/index.html +++ b/docs/_modules/index.html @@ -317,7 +317,7 @@

All modules for which code is available

- © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 21, 2021 + © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 23, 2021

diff --git a/docs/_modules/pycantonese/corpus.html b/docs/_modules/pycantonese/corpus.html index ea43f17..b35d597 100644 --- a/docs/_modules/pycantonese/corpus.html +++ b/docs/_modules/pycantonese/corpus.html @@ -703,7 +703,7 @@

Source code for pycantonese.corpus

   

- © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 21, 2021 + © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 23, 2021

diff --git a/docs/_modules/pycantonese/jyutping/characters.html b/docs/_modules/pycantonese/jyutping/characters.html index 1a97900..d5d07d2 100644 --- a/docs/_modules/pycantonese/jyutping/characters.html +++ b/docs/_modules/pycantonese/jyutping/characters.html @@ -438,7 +438,7 @@

Source code for pycantonese.jyutping.characters

<

- © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 21, 2021 + © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 23, 2021

diff --git a/docs/_modules/pycantonese/jyutping/parse_jyutping.html b/docs/_modules/pycantonese/jyutping/parse_jyutping.html index ee38da4..b23191d 100644 --- a/docs/_modules/pycantonese/jyutping/parse_jyutping.html +++ b/docs/_modules/pycantonese/jyutping/parse_jyutping.html @@ -498,7 +498,7 @@

Source code for pycantonese.jyutping.parse_jyutping

- © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 21, 2021 + © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 23, 2021

diff --git a/docs/_modules/pycantonese/jyutping/tipa.html b/docs/_modules/pycantonese/jyutping/tipa.html index 6ec32a3..851b3c0 100644 --- a/docs/_modules/pycantonese/jyutping/tipa.html +++ b/docs/_modules/pycantonese/jyutping/tipa.html @@ -447,7 +447,7 @@

Source code for pycantonese.jyutping.tipa

   

- © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 21, 2021 + © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 23, 2021

diff --git a/docs/_modules/pycantonese/jyutping/yale.html b/docs/_modules/pycantonese/jyutping/yale.html index 1d5fa37..5aae92b 100644 --- a/docs/_modules/pycantonese/jyutping/yale.html +++ b/docs/_modules/pycantonese/jyutping/yale.html @@ -625,7 +625,7 @@

Source code for pycantonese.jyutping.yale

   

- © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 21, 2021 + © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 23, 2021

diff --git a/docs/_modules/pycantonese/pos_tagging/hkcancor_to_ud.html b/docs/_modules/pycantonese/pos_tagging/hkcancor_to_ud.html index 23c23e1..1061e7b 100644 --- a/docs/_modules/pycantonese/pos_tagging/hkcancor_to_ud.html +++ b/docs/_modules/pycantonese/pos_tagging/hkcancor_to_ud.html @@ -492,7 +492,7 @@

Source code for pycantonese.pos_tagging.hkcancor_to_ud

- © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 21, 2021 + © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 23, 2021

diff --git a/docs/_modules/pycantonese/pos_tagging/tagger.html b/docs/_modules/pycantonese/pos_tagging/tagger.html index 75608cf..bc74c88 100644 --- a/docs/_modules/pycantonese/pos_tagging/tagger.html +++ b/docs/_modules/pycantonese/pos_tagging/tagger.html @@ -660,7 +660,7 @@

Source code for pycantonese.pos_tagging.tagger

- © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 21, 2021 + © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 23, 2021

diff --git a/docs/_modules/pycantonese/stop_words.html b/docs/_modules/pycantonese/stop_words.html index bd6bd51..7b5a136 100644 --- a/docs/_modules/pycantonese/stop_words.html +++ b/docs/_modules/pycantonese/stop_words.html @@ -465,7 +465,7 @@

Source code for pycantonese.stop_words

   

- © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 21, 2021 + © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 23, 2021

diff --git a/docs/_modules/pycantonese/word_segmentation.html b/docs/_modules/pycantonese/word_segmentation.html index 10353fb..8502916 100644 --- a/docs/_modules/pycantonese/word_segmentation.html +++ b/docs/_modules/pycantonese/word_segmentation.html @@ -426,7 +426,7 @@

Source code for pycantonese.word_segmentation

- © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 21, 2021 + © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 23, 2021

diff --git a/docs/_modules/pylangacq/chat.html b/docs/_modules/pylangacq/chat.html index 61c7b44..9554e0c 100644 --- a/docs/_modules/pylangacq/chat.html +++ b/docs/_modules/pylangacq/chat.html @@ -322,6 +322,7 @@

Source code for pylangacq.chat

 
 from pylangacq.measures import _CLITIC, _get_ipsyn, _get_mlum, _get_mluw, _get_ttr
 from pylangacq.objects import Gra, Token, Utterance
+from pylangacq._clean_utterance import _clean_utterance
 
 
 _ENCODING = "utf-8"
@@ -1651,252 +1652,6 @@ 

Source code for pylangacq.chat

         )
 
 
-def _clean_utterance(utterance, phon=False):
-    """Filter away the CHAT-style annotations in ``utterance``.
-
-    Parameters
-    ----------
-    utterance : str
-        The utterance as a str
-    phon : bool, optional
-        whether we are handling PhonBank data; defaults to ``False``.
-        If ``True``, words like "xxx" and "yyy" won't be removed.
-
-    Returns
-    -------
-    str
-    """
-    # Function tested with the following CHILDES datasets:
-    # 1) Brent, Brown, HSLLD, Kuczaj, MacWhinney, Valian in Eng-NA-MOR
-    # 2) YipMatthews in Biling
-    # 3) LeeWongLeung in EastAsian/Cantonese
-    # 4) CromptonPater, Goad, Inkelas, and Providence in PhonBank English
-
-    # *** At the end of each step, apply remove_extra_spaces(). ***
-
-    # Step 1: Remove unwanted scope elements (only the very certain cases)
-    # [= whatever] for explanations
-    # [x how_many_times] for collapses
-    # [+ whatever] for actions etc
-    # [* whatever] for error coding
-    # [=? whatever] for uncertain transcriptions
-    # [=! whatever] for actions etc
-    # [% whatever] for random noises?
-    # [- language_name] for using a non-dominant language
-    # [^ whatever] for complex local events
-    # whatever for audio/video time stamps? the  character is 0x15
-    # [<] and [>] for overlapping, including [<1], [>2] etc with numbers
-    # (2.), (3.5) etc for pauses
-    # [%act: whatever] for actions etc
-
-    # [?] for best guess
-    # ‹ and › used in conjunction with [?]
-    # [!] for stressing
-
-    # "[*] [/" replaced by "[/"
-    # "] [*]" replaced by "]"
-
-    # print('utterance:', utterance, type(utterance))
-
-    utterance = re.sub(r"\[= [^\[]+?\]", "", utterance)
-    utterance = re.sub(r"\[x \d+?\]", "", utterance)
-    utterance = re.sub(r"\[\+ [^\[]+?\]", "", utterance)
-    utterance = re.sub(r"\[\* [^\[]+?\]", "", utterance)
-    utterance = re.sub(r"\[=\? [^\[]+?\]", "", utterance)
-    utterance = re.sub(r"\[=! [^\[]+?\]", "", utterance)
-    utterance = re.sub(r"\[% [^\[]+?\]", "", utterance)
-    utterance = re.sub(r"\[- [^\[]+?\]", "", utterance)
-    utterance = re.sub(r"\[\^ [^\[]+?\]", "", utterance)
-    utterance = re.sub(r"[^]+?", "", utterance)
-    utterance = re.sub(r"\[<\d?\]", "", utterance)
-    utterance = re.sub(r"\[>\d?\]", "", utterance)
-    utterance = re.sub(r"\(\d+?\.?\d*?\)", "", utterance)
-    utterance = re.sub(r"\[%act: [^\[]+?\]", "", utterance)
-
-    utterance = re.sub(r"\[\?\]", "", utterance)
-    utterance = re.sub(r"\[\!\]", "", utterance)
-    utterance = re.sub(r"‹", "", utterance)
-    utterance = re.sub(r"›", "", utterance)
-
-    utterance = re.sub(r"\[\*\] \[/", "[/", utterance)
-    utterance = re.sub(r"\] \[\*\]", "]", utterance)
-
-    utterance = _remove_extra_spaces(utterance)
-    # print('step 1:', utterance)
-
-    # Step 2: Pad elements with spaces to avoid human transcription errors
-    # If utterance has these delimiters: [ ]
-    # then pad them with extra spaces to avoid errors in transcriptions
-    # like "movement[?]" (--> "movement [?]")
-    #
-    # If utterance has:
-    #     < > (left and right angle brackets), excluding "+<" (lazy overlap)
-    #     “ (beginning quote)
-    #     ” (ending quote)
-    #     , (comma)
-    #     ? (question mark)
-    #     . (period) <-- commented out at the moment
-    #     (.) (short pause)
-    # then pad them with extra spaces.
-
-    utterance = re.sub(r"<", " <", utterance)
-    utterance = re.sub(r"\+ <", "+<", utterance)
-    utterance = re.sub(r">", "> ", utterance)
-    utterance = re.sub(r"\[", " [", utterance)
-    utterance = re.sub(r"\]", "] ", utterance)
-    utterance = re.sub(r"“", " “ ", utterance)
-    utterance = re.sub(r"”", " ” ", utterance)
-    utterance = re.sub(r",", " , ", utterance)  # works together with next line
-    utterance = re.sub(r"\+ ,", "+,", utterance)
-    utterance = re.sub(r"[^\[\./!]\?", " ? ", utterance)
-    # utterance = re.sub('[^\(\[\.\+]\.', ' . ', utterance)
-    utterance = re.sub(r"\(\.\)", " (.) ", utterance)
-    utterance = _remove_extra_spaces(utterance)
-    # print('step 2:', utterance)
-
-    # Step 3:
-    # Handle [/], [//], [///], [/?] for repetitions/reformulation
-    #        [: xx] or [:: xx] for errors
-    #
-    # Discard "xx [/]", "<xx yy> [/]", "xx [//]", "<xx yy> [//]".
-    # For "zz [: xx]" or "<yy zz> [:: xx]", keep "xx" and discard the rest.
-    #
-    # Strategies:
-    # 1. Get all matching index pairs for angle brackets < and >.
-    # 2. Delete the unwanted material inside and including these brackets
-    #    plus their signaling annotations (= "[:", "[::", "[/]", "[//]").
-    # 3. Delete the unwanted words on the left of the signaling annotations.
-
-    angle_brackets_l2r_pairs = {}  # left-to-right
-    for index_ in _find_indices(utterance, "<"):
-        counter = 1
-        for i in range(index_ + 1, len(utterance)):
-            if utterance[i] == "<":
-                counter += 1
-            elif utterance[i] == ">":
-                counter -= 1
-
-            if counter == 0:
-                angle_brackets_l2r_pairs[index_] = i
-                break
-    angle_brackets_r2l_pairs = {v: k for k, v in angle_brackets_l2r_pairs.items()}
-
-    index_pairs = []  # characters bounded by index pairs to be removed
-
-    # remove ' [///]'
-    triple_slash_right_indices = _find_indices(utterance, r"> \[///\]")
-    index_pairs += [(begin + 1, begin + 6) for begin in triple_slash_right_indices]
-
-    # remove ' [//]'
-    double_overlap_right_indices = _find_indices(utterance, r"> \[//\]")
-    index_pairs += [(begin + 1, begin + 5) for begin in double_overlap_right_indices]
-
-    # remove ' [/]'
-    single_overlap_right_indices = _find_indices(utterance, r"> \[/\]")
-    index_pairs += [(begin + 1, begin + 4) for begin in single_overlap_right_indices]
-
-    # remove ' [/?]'
-    slash_question_indices = _find_indices(utterance, r"> \[/\?\]")
-    index_pairs += [(begin + 1, begin + 4) for begin in slash_question_indices]
-
-    # remove ' [/-]'
-    slash_dash_indices = _find_indices(utterance, r"> \[/\-\]")
-    index_pairs += [(begin + 1, begin + 4) for begin in slash_dash_indices]
-
-    # remove ' [::'
-    double_error_right_indices = _find_indices(utterance, r"> \[::")
-    index_pairs += [(begin + 1, begin + 4) for begin in double_error_right_indices]
-
-    # remove ' [:'
-    single_error_right_indices = _find_indices(utterance, r"> \[: ")
-    index_pairs += [(begin + 1, begin + 3) for begin in single_error_right_indices]
-
-    right_indices = (
-        double_overlap_right_indices
-        + single_overlap_right_indices
-        + double_error_right_indices
-        + single_error_right_indices
-        + triple_slash_right_indices
-        + slash_question_indices
-        + slash_dash_indices
-    )
-
-    index_pairs = index_pairs + [
-        (angle_brackets_r2l_pairs[right], right) for right in sorted(right_indices)
-    ]
-    indices_to_ignore = set()
-    for left, right in index_pairs:
-        for i in range(left, right + 1):
-            indices_to_ignore.add(i)
-
-    new_utterance = ""
-    for i in range(len(utterance)):
-        if i not in indices_to_ignore:
-            new_utterance += utterance[i]
-    utterance = new_utterance
-
-    utterance = re.sub(r"\S+? \[/\]", "", utterance)
-    utterance = re.sub(r"\S+? \[//\]", "", utterance)
-    utterance = re.sub(r"\S+? \[///\]", "", utterance)
-    utterance = re.sub(r"\S+? \[/\?\]", "", utterance)
-    utterance = re.sub(r"\S+? \[/\-\]", "", utterance)
-
-    utterance = re.sub(r"\S+? \[::", "", utterance)
-    utterance = re.sub(r"\S+? \[:", "", utterance)
-
-    utterance = _remove_extra_spaces(utterance)
-    # print('step 3:', utterance)
-
-    # Step 4: Remove unwanted symbols
-    utterance = re.sub(r"“", "", utterance)
-    utterance = re.sub(r"”", "", utterance)
-
-    utterance = _remove_extra_spaces(utterance)
-
-    # Step 5: Split utterance by spaces and determine whether to keep items.
-
-    escape_prefixes = {
-        "[?",
-        "[/",
-        "[<",
-        "[>",
-        "[:",
-        "[!",
-        "[*",
-        '+"',
-        "+,",
-        "<&",
-    }
-    escape_words = {"0", "++", "+<", "+^", "(.)", "(..)", "(...)", ":", ";"}
-    keep_prefixes = {'+"/', "+,/", '+".'}
-
-    if not phon:
-        escape_words.update({"xxx", "yyy", "www", "xxx:", "yyy:"})
-        escape_prefixes.update({"&"})
-    else:
-        escape_words.update({","})
-        escape_prefixes.update({"0"})
-
-    words = utterance.split()
-    new_words = []
-
-    for word in words:
-        word = re.sub(r"\A<", "", word)  # remove beginning <
-        word = re.sub(r">\Z", "", word)  # remove final >
-        word = re.sub(r"\]\Z", "", word)  # remove final ]
-
-        not_an_escape_word = word not in escape_words
-        no_escape_prefix = not any(word.startswith(e) for e in escape_prefixes)
-        has_keep_prefix = any(word.startswith(k) for k in keep_prefixes)
-
-        if (not_an_escape_word and no_escape_prefix) or has_keep_prefix:
-            new_words.append(word)
-
-    # print('step 5:', remove_extra_spaces(' '.join(new_words)))
-
-    return _remove_extra_spaces(" ".join(new_words))
-
-
 def _clean_word(word):
     """Clean the word.
 
@@ -1925,38 +1680,6 @@ 

Source code for pylangacq.chat

     return new_word
 
 
-def _remove_extra_spaces(inputstr):
-    """Remove extra spaces in *inputstr* so that there are only single spaces.
-
-    Parameters
-    ----------
-    inputstr : str
-
-    Returns
-    -------
-    str
-    """
-    while "  " in inputstr:
-        inputstr = inputstr.replace("  ", " ")
-    return inputstr.strip()
-
-
-def _find_indices(longstr, substring):
-    """Find all indices of non-overlapping ``substring`` in ``longstr``.
-
-    Parameters
-    ----------
-    longstr : str
-    substring : str
-
-    Returns
-    -------
-    list of int
-        List of indices of the long string for where substring occurs
-    """
-    return [m.start() for m in re.finditer(substring, longstr)]
-
-
 class _HTTPSession(requests.Session):
     def __init__(
         self, max_retries: int = 10, backoff_factor: float = 0.1, timeout: int = 60
@@ -1991,7 +1714,7 @@ 

Source code for pylangacq.chat

   

- © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 21, 2021 + © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 23, 2021

diff --git a/docs/_sources/data.rst.txt b/docs/_sources/data.rst.txt index a180e5d..b22ce79 100644 --- a/docs/_sources/data.rst.txt +++ b/docs/_sources/data.rst.txt @@ -65,6 +65,34 @@ the CC BY-NC-SA 3.0 license. As of March 2021, the following Cantonese-related datasets are available from CHILDES and TalkBank (in alphabetical order): +.. invisible-code-block: python + + >>> import os + +.. skip: start if(os.getenv("CI") == "true", reason="certain CHILDES data pulls fail in some but not all python versions for unknown reasons") + +* `Child Heritage Chinese Corpus `_ + + .. code-block:: python + + >>> url = "https://childes.talkbank.org/data/Biling/CHCC.zip" + >>> corpus = pycantonese.read_chat(url) + >>> corpus.n_files() + 190 + >>> len(corpus.words()) + 533877 + +* `Guthrie Bilingual Corpus `_ + + .. code-block:: python + + >>> url = "https://childes.talkbank.org/data/Biling/Guthrie.zip" + >>> corpus = pycantonese.read_chat(url) + >>> corpus.n_files() + 36 + >>> len(corpus.words()) + 70438 + * `HKU-70 Corpus `_ .. code-block:: python @@ -76,12 +104,6 @@ available from CHILDES and TalkBank (in alphabetical order): >>> len(corpus.words()) 178270 -.. invisible-code-block: python - - >>> import os - -.. skip: start if(os.getenv("CI") == "true", reason="certain CHILDES data pulls fail in some but not all python versions for unknown reasons") - * `Lee-Wong-Leung Corpus `_ .. code-block:: python diff --git a/docs/api.html b/docs/api.html index 1d61373..3ade092 100644 --- a/docs/api.html +++ b/docs/api.html @@ -1601,7 +1601,7 @@

- © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 21, 2021 + © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 23, 2021

diff --git a/docs/changelog.html b/docs/changelog.html index 9da0ba6..02268f4 100644 --- a/docs/changelog.html +++ b/docs/changelog.html @@ -641,7 +641,7 @@

[0.1] - 2014-12-17

- © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 21, 2021 + © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 23, 2021

diff --git a/docs/data.html b/docs/data.html index 7ed409e..0e3b2aa 100644 --- a/docs/data.html +++ b/docs/data.html @@ -350,6 +350,30 @@

CHILDES and TalkBank DataChild Heritage Chinese Corpus

+
+
>>> url = "https://childes.talkbank.org/data/Biling/CHCC.zip"
+>>> corpus = pycantonese.read_chat(url)
+>>> corpus.n_files()
+190
+>>> len(corpus.words())
+533877
+
+
+
+ +
  • Guthrie Bilingual Corpus

    +
    +
    >>> url = "https://childes.talkbank.org/data/Biling/Guthrie.zip"
    +>>> corpus = pycantonese.read_chat(url)
    +>>> corpus.n_files()
    +36
    +>>> len(corpus.words())
    +70438
    +
    +
    +
    +
  • HKU-70 Corpus

    >>> url = "https://childes.talkbank.org/data/Chinese/Cantonese/HKU.zip"
    @@ -362,8 +386,6 @@ 

    CHILDES and TalkBank DataLee-Wong-Leung Corpus

    diff --git a/docs/generated/pycantonese.CHATReader.search.html b/docs/generated/pycantonese.CHATReader.search.html index e89d5df..31d4017 100644 --- a/docs/generated/pycantonese.CHATReader.search.html +++ b/docs/generated/pycantonese.CHATReader.search.html @@ -400,7 +400,7 @@

    pycantonese.CHATReader.search

    - © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 21, 2021 + © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 23, 2021

    diff --git a/docs/generated/pycantonese.characters_to_jyutping.html b/docs/generated/pycantonese.characters_to_jyutping.html index 552c60a..ef3fd98 100644 --- a/docs/generated/pycantonese.characters_to_jyutping.html +++ b/docs/generated/pycantonese.characters_to_jyutping.html @@ -363,7 +363,7 @@

    pycantonese.characters_to_jyutping

    - © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 21, 2021 + © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 23, 2021

    diff --git a/docs/generated/pycantonese.hkcancor.html b/docs/generated/pycantonese.hkcancor.html index 9992063..66e3640 100644 --- a/docs/generated/pycantonese.hkcancor.html +++ b/docs/generated/pycantonese.hkcancor.html @@ -337,7 +337,7 @@

    pycantonese.hkcancor

    - © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 21, 2021 + © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 23, 2021

    diff --git a/docs/generated/pycantonese.jyutping_to_tipa.html b/docs/generated/pycantonese.jyutping_to_tipa.html index 826f446..d991c11 100644 --- a/docs/generated/pycantonese.jyutping_to_tipa.html +++ b/docs/generated/pycantonese.jyutping_to_tipa.html @@ -358,7 +358,7 @@

    pycantonese.jyutping_to_tipa

    - © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 21, 2021 + © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 23, 2021

    diff --git a/docs/generated/pycantonese.jyutping_to_yale.html b/docs/generated/pycantonese.jyutping_to_yale.html index 070a363..86d13f8 100644 --- a/docs/generated/pycantonese.jyutping_to_yale.html +++ b/docs/generated/pycantonese.jyutping_to_yale.html @@ -374,7 +374,7 @@

    pycantonese.jyutping_to_yale

    - © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 21, 2021 + © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 23, 2021

  • diff --git a/docs/generated/pycantonese.parse_jyutping.html b/docs/generated/pycantonese.parse_jyutping.html index 0ff328a..32b9944 100644 --- a/docs/generated/pycantonese.parse_jyutping.html +++ b/docs/generated/pycantonese.parse_jyutping.html @@ -357,7 +357,7 @@

    pycantonese.parse_jyutping

    - © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 21, 2021 + © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 23, 2021

    diff --git a/docs/generated/pycantonese.pos_tag.html b/docs/generated/pycantonese.pos_tag.html index 0cb6fb5..19b3e99 100644 --- a/docs/generated/pycantonese.pos_tag.html +++ b/docs/generated/pycantonese.pos_tag.html @@ -382,7 +382,7 @@

    pycantonese.pos_tag

    - © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 21, 2021 + © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 23, 2021

    diff --git a/docs/generated/pycantonese.pos_tagging.hkcancor_to_ud.html b/docs/generated/pycantonese.pos_tagging.hkcancor_to_ud.html index 329753e..d11c80c 100644 --- a/docs/generated/pycantonese.pos_tagging.hkcancor_to_ud.html +++ b/docs/generated/pycantonese.pos_tagging.hkcancor_to_ud.html @@ -363,7 +363,7 @@

    pycantonese.pos_tagging.hkcancor_to_ud

    - © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 21, 2021 + © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 23, 2021

    diff --git a/docs/generated/pycantonese.read_chat.html b/docs/generated/pycantonese.read_chat.html index 9e46ef2..87e5f98 100644 --- a/docs/generated/pycantonese.read_chat.html +++ b/docs/generated/pycantonese.read_chat.html @@ -366,7 +366,7 @@

    pycantonese.read_chat

    - © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 21, 2021 + © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 23, 2021

    diff --git a/docs/generated/pycantonese.segment.html b/docs/generated/pycantonese.segment.html index 2c2af0b..7727813 100644 --- a/docs/generated/pycantonese.segment.html +++ b/docs/generated/pycantonese.segment.html @@ -370,7 +370,7 @@

    pycantonese.segment

    - © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 21, 2021 + © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 23, 2021

    diff --git a/docs/generated/pycantonese.stop_words.html b/docs/generated/pycantonese.stop_words.html index 54c4400..ca1e2bb 100644 --- a/docs/generated/pycantonese.stop_words.html +++ b/docs/generated/pycantonese.stop_words.html @@ -364,7 +364,7 @@

    pycantonese.stop_words

    - © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 21, 2021 + © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 23, 2021

    diff --git a/docs/generated/pycantonese.word_segmentation.Segmenter.html b/docs/generated/pycantonese.word_segmentation.Segmenter.html index de6a0f0..4b5c9de 100644 --- a/docs/generated/pycantonese.word_segmentation.Segmenter.html +++ b/docs/generated/pycantonese.word_segmentation.Segmenter.html @@ -384,7 +384,7 @@

    pycantonese.word_segmentation.Segmenter

    - © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 21, 2021 + © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 23, 2021

    diff --git a/docs/genindex.html b/docs/genindex.html index 2dafb2e..6c87a91 100644 --- a/docs/genindex.html +++ b/docs/genindex.html @@ -570,7 +570,7 @@

    W

    - © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 21, 2021 + © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 23, 2021

    diff --git a/docs/index.html b/docs/index.html index bec83bc..1b75d95 100644 --- a/docs/index.html +++ b/docs/index.html @@ -476,7 +476,7 @@

    Table of Contents

    - © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 21, 2021 + © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 23, 2021

    diff --git a/docs/jyutping.html b/docs/jyutping.html index 03a6ef3..1680b6f 100644 --- a/docs/jyutping.html +++ b/docs/jyutping.html @@ -471,7 +471,7 @@

    Jyutping-to-TIPA Conversion

    - © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 21, 2021 + © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 23, 2021

    diff --git a/docs/papers.html b/docs/papers.html index 2765416..adc0675 100644 --- a/docs/papers.html +++ b/docs/papers.html @@ -329,7 +329,7 @@

    - © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 21, 2021 + © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 23, 2021

    diff --git a/docs/pos_tagging.html b/docs/pos_tagging.html index 43262f1..10140f8 100644 --- a/docs/pos_tagging.html +++ b/docs/pos_tagging.html @@ -366,7 +366,7 @@

    - © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 21, 2021 + © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 23, 2021

    diff --git a/docs/reader.html b/docs/reader.html index d379c82..17bf670 100644 --- a/docs/reader.html +++ b/docs/reader.html @@ -591,7 +591,7 @@

    Word Frequencies and Ngrams

    - © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 21, 2021 + © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 23, 2021

    diff --git a/docs/search.html b/docs/search.html index f45ddde..1618b22 100644 --- a/docs/search.html +++ b/docs/search.html @@ -321,7 +321,7 @@

    - © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 21, 2021 + © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 23, 2021

    diff --git a/docs/searches.html b/docs/searches.html index 224c63f..a0ee23a 100644 --- a/docs/searches.html +++ b/docs/searches.html @@ -584,7 +584,7 @@

    - © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 21, 2021 + © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 23, 2021

    diff --git a/docs/searchindex.js b/docs/searchindex.js index 1887a5b..a3729f2 100644 --- a/docs/searchindex.js +++ b/docs/searchindex.js @@ -1 +1 @@ -Search.setIndex({docnames:["api","changelog","data","generated/pycantonese.CHATReader","generated/pycantonese.CHATReader.search","generated/pycantonese.characters_to_jyutping","generated/pycantonese.hkcancor","generated/pycantonese.jyutping_to_tipa","generated/pycantonese.jyutping_to_yale","generated/pycantonese.parse_jyutping","generated/pycantonese.pos_tag","generated/pycantonese.pos_tagging.hkcancor_to_ud","generated/pycantonese.read_chat","generated/pycantonese.segment","generated/pycantonese.stop_words","generated/pycantonese.word_segmentation.Segmenter","index","jyutping","papers","pos_tagging","reader","searches","stop_words","word_segmentation"],envversion:{"sphinx.domains.c":2,"sphinx.domains.changeset":1,"sphinx.domains.citation":1,"sphinx.domains.cpp":3,"sphinx.domains.index":1,"sphinx.domains.javascript":2,"sphinx.domains.math":2,"sphinx.domains.python":2,"sphinx.domains.rst":2,"sphinx.domains.std":1,"sphinx.ext.intersphinx":1,"sphinx.ext.viewcode":1,sphinx:56},filenames:["api.rst","changelog.rst","data.rst","generated/pycantonese.CHATReader.rst","generated/pycantonese.CHATReader.search.rst","generated/pycantonese.characters_to_jyutping.rst","generated/pycantonese.hkcancor.rst","generated/pycantonese.jyutping_to_tipa.rst","generated/pycantonese.jyutping_to_yale.rst","generated/pycantonese.parse_jyutping.rst","generated/pycantonese.pos_tag.rst","generated/pycantonese.pos_tagging.hkcancor_to_ud.rst","generated/pycantonese.read_chat.rst","generated/pycantonese.segment.rst","generated/pycantonese.stop_words.rst","generated/pycantonese.word_segmentation.Segmenter.rst","index.rst","jyutping.rst","papers.rst","pos_tagging.rst","reader.rst","searches.rst","stop_words.rst","word_segmentation.rst"],objects:{"pycantonese.CHATReader":{__init__:[3,1,1,""],ages:[0,1,1,""],append:[0,1,1,""],append_left:[0,1,1,""],characters:[0,1,1,""],clear:[0,1,1,""],dates_of_recording:[0,1,1,""],extend:[0,1,1,""],extend_left:[0,1,1,""],file_paths:[0,1,1,""],from_dir:[0,1,1,""],from_files:[0,1,1,""],from_strs:[0,1,1,""],from_zip:[0,1,1,""],headers:[0,1,1,""],ipsyn:[0,1,1,""],jyutping:[0,1,1,""],languages:[0,1,1,""],mlu:[0,1,1,""],mlum:[0,1,1,""],mluw:[0,1,1,""],n_files:[0,1,1,""],participants:[0,1,1,""],pop:[0,1,1,""],pop_left:[0,1,1,""],search:[4,1,1,""],sents:[0,1,1,""],tagged_sents:[0,1,1,""],tagged_words:[0,1,1,""],tokens:[0,1,1,""],ttr:[0,1,1,""],utterances:[0,1,1,""],word_frequencies:[0,1,1,""],word_ngrams:[0,1,1,""],words:[0,1,1,""]},"pycantonese.corpus":{Token:[0,0,1,""]},"pycantonese.jyutping":{Jyutping:[0,0,1,""]},"pycantonese.jyutping.Jyutping":{"final":[0,1,1,""],__str__:[0,1,1,""]},"pycantonese.pos_tagging":{hkcancor_to_ud:[11,2,1,""]},"pycantonese.word_segmentation":{Segmenter:[15,0,1,""]},"pycantonese.word_segmentation.Segmenter":{__init__:[15,1,1,""]},pycantonese:{CHATReader:[3,0,1,""],characters_to_jyutping:[5,2,1,""],hkcancor:[6,2,1,""],jyutping_to_tipa:[7,2,1,""],jyutping_to_yale:[8,2,1,""],parse_jyutping:[9,2,1,""],pos_tag:[10,2,1,""],read_chat:[12,2,1,""],segment:[13,2,1,""],stop_words:[14,2,1,""]}},objnames:{"0":["py","class","Python class"],"1":["py","method","Python method"],"2":["py","function","Python function"]},objtypes:{"0":"py:class","1":"py:method","2":"py:function"},terms:{"000":19,"001":20,"001_v2":20,"100":[10,11,19,22],"104":[1,14,22],"105":[14,22],"107":22,"1177307":2,"127":[],"12715":21,"13251":20,"134":20,"140":20,"150":19,"153654":2,"160":2,"161":2,"167":20,"16730":2,"1681":21,"178270":2,"186":20,"1949480":2,"195":21,"197":17,"1997":20,"1st":21,"2015":[16,18],"2016":18,"202":20,"2020":[16,17,23],"2021":2,"209":20,"21167":21,"219":20,"22328":21,"223415":2,"2259":20,"2570":20,"2734":20,"2741":20,"2755":20,"29012":[],"2911":20,"29726":[16,21],"29954":21,"30th":20,"3rd":18,"4110":20,"501":2,"5019":20,"520":20,"527":20,"705":21,"9282":20,"\u4e00\u5572":[14,22],"\u4e00\u5b9a":[14,22],"\u4e03":21,"\u4e0d\u5982":[14,22],"\u4e0d\u904e":[14,21,22],"\u4e5d\u9f8d":22,"\u4f4f":21,"\u4f60":[20,21],"\u4f62":20,"\u4fc2":[16,20,21],"\u505c\u7528\u8a5e":16,"\u505c\u7528\u8bcd":16,"\u516b\u6708":21,"\u5187\u5f97":[16,21],"\u5206\u8a5e":16,"\u5206\u8bcd":16,"\u53bb":[16,20,21],"\u53ef\u4ee5":21,"\u5416":20,"\u5462":20,"\u54aa":21,"\u5514":[13,20,23],"\u5514\u4fc2":21,"\u5514\u8a72":17,"\u554a":[20,21],"\u5572":20,"\u5582":20,"\u5587":[20,21],"\u558e":20,"\u55ce":20,"\u55f0":[10,19],"\u55f0\u500b":21,"\u55f0\u908a":21,"\u55f1":21,"\u5649":20,"\u565a\u65e5":[10,19],"\u56d6":[20,21],"\u597d":[16,17,21],"\u5b78":[13,16,23],"\u5bb9":[13,23],"\u5bb9\u5514\u5bb9\u6613":[13,23],"\u5bb9\u6613":[13,23],"\u5c0d":[10,19],"\u5c31":20,"\u5e7e":21,"\u5e7f\u4e1c\u8bdd":16,"\u5ee3\u6771":23,"\u5ee3\u6771\u8a71":[5,7,8,9,13,16,17,20,23],"\u5ee3\u6771\u8a71\u597d\u96e3\u5b78":16,"\u5ee3\u6771\u8a71\u5bb9\u5514\u5bb9\u6613\u5b78":[13,23],"\u6211":[10,19,20],"\u6211\u565a\u65e5\u8cb7\u55f0\u5c0d\u978b":19,"\u62b5":21,"\u65b0\u754c":22,"\u65c5":20,"\u65c5\u884c":[16,20,21],"\u65e5":21,"\u6709\u5187":[16,21],"\u6709\u5f97":[16,21],"\u6a5f":21,"\u6a5f\u7968":21,"\u6c23\u5019":[8,17],"\u6de1\u5b63":21,"\u73a9":21,"\u76f4\u7a0b":21,"\u771f\u4fc2":21,"\u7793\u89ba":21,"\u789f":21,"\u789f\u5f62":21,"\u7ca4\u62fc":16,"\u7ca4\u8bed":16,"\u7cb5":16,"\u7cb5\u62fc":16,"\u7cb5\u8a9e":16,"\u7da0":21,"\u8072\u6bcd":21,"\u807d":21,"\u81ea\u7136\u8a9e\u8a00\u8655\u7406":16,"\u81ea\u7136\u8bed\u8a00\u5904\u7406":16,"\u86cb":17,"\u86cb\u7cd5":17,"\u884c":20,"\u8981":[16,21],"\u8a5e\u6027\u6a19\u6ce8":16,"\u8a71":[21,23],"\u8a92":20,"\u8a9e\u8a00\u5b78":16,"\u8b1b":[5,16,17,21],"\u8b8a\u97f3":17,"\u8bcd\u6027\u6807\u6ce8":16,"\u8bed\u8a00\u5b66":16,"\u8cb7":[10,19],"\u8cca":21,"\u8ddf":21,"\u8fea\u58eb\u5c3c":21,"\u904e":21,"\u9072":20,"\u90fd":21,"\u96c0":21,"\u96e3":16,"\u978b":[10,19],"\u97fb\u6bcd":[17,21],"\u98db\u6a5f":21,"\u9999\u6e2f":[14,22],"\u9999\u6e2f\u4eba":[5,16,17],"\u9999\u6e2f\u4eba\u8b1b\u5ee3\u6771\u8a71":[5,16,17],"\u9999\u6e2f\u5cf6":22,"\ud842\udfa9\ud843\ude4c":1,"\ud843\udd15":1,"\ud843\udd15":[],"\ud843\ude9d":1,"\ud843\ude9d":[],"\ud843\udea2":[1,20],"\ud843\udea2":[],"\ud843\uded7":1,"\ud844\udc14":1,"\ud844\udc14":[],"\ud844\udc5c":1,"\ud844\udc5c":[],"\ud844\udcc9":[20,21],"\ud844\udcc9":[],"\ud844\udcd3":1,"\ud844\udcd3":[],"\ud854\ude99":1,"\ud854\ude99":[],"\ud85d\udd74":1,"\ud85d\udd74":[],"case":[0,3,17,21,22],"char":5,"class":[0,1,3,13,15,17,20,23],"computational linguist":16,"d\u016bng":[8,17],"default":[0,1,4,8,10,12,13,19,21,22,23],"final":[0,1,4,17,21],"float":0,"function":[1,2,5,7,8,11,17,19,20,21,22,23],"g\u014di":17,"gw\u00f3ng":[8,17],"gw\u00f3ngd\u016bngw\u00e1":[8,17],"import":[2,13,16,17,19,20,21,22,23],"int":[0,4,15,21],"natural language process":16,"new":[0,1,5,7,8,10,11,13,14,15,22],"null":1,"part-of-speech tag":16,"return":[0,1,4,5,6,7,8,9,10,11,12,13,14,17,20,21,22,23],"stop word":16,"super":[7,17],"switch":[1,8],"true":[0,1,4,8,14,20,21,22],"while":[17,20,21],"word segment":16,Added:13,Eve:[0,12],For:[0,2,4,11,12,20,21],Its:19,One:[0,4],POS:10,Such:0,The:[0,1,2,4,5,10,12,13,16,17,18,19,20,21,22,23],There:10,Used:1,With:[16,21],__init__:[3,15],__str__:0,__version__:16,aa3:[20,21],abil:17,abl:[2,17],about:[19,22],abov:[1,21],accept:19,access:[1,16,17,20,21],accommod:20,acquisit:[0,2,3],adam:[0,12],add:[14,17,22],addit:20,adjust:21,adopt:[1,2],adult:20,adv:[10,19],after:[0,4,20],age:[0,16,18,20],ages:[0,20],albino:16,algorithm:23,all:[0,1,2,4,16,20,21,22],all_verb:[16,21],allow:[1,10,13,15,21,23],allow_remot:0,alon:21,alphabet:[2,16],alreadi:[0,19,21],also:[17,20,21,22],alwai:21,ambigu:[8,17],american:[0,12],among:17,analysi:21,ani:[0,5,11,17,21],annot:[1,2,11,16,19,21],anonym:20,anoth:[0,17],anyth:2,api:16,append:0,append_left:0,appli:[0,1,10,12],applic:[0,3,11],approach:[13,19],appropri:[0,20],apr:20,april:20,arbitrari:0,argument:[1,10,13,17,19,20,22,23],aris:17,around:[20,21],artist:16,as_list:[1,8,17],ask:21,associ:[2,20],attempt:0,attribut:[0,17,20],audio:20,augment:20,author:16,automat:[17,21],avail:[1,2,20],averag:[10,19],baat3jyut6:21,back:1,ban:23,base:[5,17,21],basic:[1,19],bat1gwo3:21,bear:21,becaus:[21,23],becom:0,been:[0,1,2,16,19,20],befor:[0,1,4],begin:[0,12,16,20,21],behavior:[13,17],being:8,below:[1,20],benefit:11,better:1,between:[8,17,20],beyond:2,big:[16,18],bile:[2,12],bilingu:[2,16,18],bool:[0,4,8],both:[0,1,2,20],bought:[10,19],boundari:[8,23],brown:[0,12],bug:[1,16],build:1,built:[1,20,21],bump:1,by_fil:[0,4,20],by_token:[0,1,4,21],by_utter:[0,1,4,20,21],caak2:21,cake:17,call:[17,20,21],can:[0,2,11,12,17,20,21,22,23],cannot:0,cantones:[0,1,2,3,4,5,6,7,8,9,10,12,13,14,17,18,19,20,21,22,23],cap:1,capabl:[20,21],capit:0,cat1:21,cathug:16,centr:[16,18],certain:23,cha:[0,2,12,20],chang:[5,8,13,17,21],changelog:16,chao:17,charact:[0,1,4,5,7,8,9,10,13,16,23],character_s:[0,1,3],characters2jyutp:[1,5],characters_to_jyutp:[1,16,17],charl:16,chat:[0,1,3,12,20],chatread:[1,2,6,12,20],check:2,chen:[16,18],chi:[0,20],child:[0,12,20],childhood:[16,18],children:[0,12],chim:16,chines:[0,2,4,16,17,18],ci4:20,circleci:1,classmethod:[0,2],clear:0,climat:[8,17],cls:[13,23],coda:[0,1,4,9,16,17,21],codas_ptk:21,code:[1,2,9,16,20,21],collaps:0,collect:[0,20],colloc:21,com:16,combin:[0,19,21],come:[0,1,2,16,19],common:[17,21],commonli:17,compl:[10,11],complet:21,compon:17,comput:18,concurr:23,confus:17,conson:[8,17],constrain:23,constraint:21,consult:2,contact:16,contain:[0,2,4,12,13,17,21,23],contextu:17,contrast:20,contribut:16,control:[2,20,23],conveni:[2,20],convent:[0,1,20,23],convers:[1,2,5,16,21],convert:[1,2,5,7,8,10,17],corpora:2,corpu:[1,3,5,6,12,13,16,17,19,23],correct:17,correspond:[0,20],count:20,counter:[0,20],counterpart:[1,2],cover:1,creat:[6,20],criteria:[0,4],criterion:[1,2],cross:[11,19],current:[0,2,16,17,19,20,23],custom:[1,13,20,21],customiz:[1,15],daam6gwai3:21,daan2:17,daan6gou1:17,dai2:21,dai:0,data:[1,4,5,10,11,12,13,16,17,18,19,20,21,22,23],dataset:[0,1,2,12,16,20],date:[0,20],dates_of_record:0,datetim:[0,20],deal:20,dedic:20,defin:[1,20],demograph:20,depend:[1,10,11,17,19,22,23],deprec:[0,4,5,7,8],describ:[0,10,11,19,21],design:[2,16,21],detail:[1,16,20],detect:17,determin:[0,22],develop:18,di1:20,dict:[0,11,20],dictioari:11,dictionari:[11,17],differ:[20,21],difficult:16,dik6si6nei4:21,dip2:21,dip2jing4:21,direct:[0,1],directli:17,directori:[0,2,12],disabl:[0,1,4],disallow:[1,13,15,23],disambigu:[8,17],discours:21,dist:17,distinct:0,distribut:21,doc:1,docstr:1,document:[1,2,20],doe:[0,12,13],domain:1,domin:0,done:0,dou1:21,download:[0,2],drive:2,drop:[0,1],due:[17,19],duplic:0,dut2:21,each:[0,1,5,10,21],easi:[13,17,23],edu:[10,11],educ:20,egg:17,either:[0,12,22],element:[0,1,7,8,9,20],email:16,empti:3,encod:[0,1,12],end:[17,23],eng:[0,20],english:[0,12,20,22,23],enough:17,entir:[11,23],entri:1,equival:[0,1,4,5,7,8,17,21],error:[1,17],especi:19,etc:[16,20,21],european:0,even:0,exampl:[0,4,5,7,8,9,10,11,12,13,14,17,20,21,22],exclud:[0,1,4,12,20],exist:[0,23],expect:19,explicitli:23,expos:[1,17,19],express:[0,12,16,21],extend:0,extend_left:0,extens:0,facebook:16,fact:19,fals:[0,1,4,8,14,17,21],fan3gaau3:21,fat:20,father:20,favor:1,featur:16,feedback:16,fei1gei1:21,femal:20,file:[0,1,2,3,4,12,17,20],file_path:0,filenam:20,filter:[0,12,20,22],find:[16,21],fine:2,first:[0,20,21],five:[13,23],flavor:20,flexibl:21,folder:1,follow:[0,1,2,12,21,23],forc:[0,1],form:[0,1,4,23],format:[1,16,20],found:[20,21,23],frequenc:0,from:[0,1,2,3,8,10,11,12,13,16,17,19,20,21,22,23],from_dir:[0,2],from_fil:[0,2],from_str:[0,2],from_zip:[0,2],full:21,further:[17,19,20],futur:17,gaa3:21,gan1:21,gei1:21,gei1piu3:21,gei2:21,gender:20,gener:[0,1,2,4,21],get:[16,17,19,20],github:[1,16],give:21,given:[0,1,2,4,11,20,21],go2bin1:21,go2go3:21,gong2:[5,16,17,21],good:17,gra:[0,16,20,21],grab:21,grain:2,grammat:0,granular:11,group:20,gwo3:21,gwong2dung1waa2:[5,7,8,9,16,17],hai6:[16,21],han:16,handl:[0,2,12,16,17,20],handout:18,has:[1,2,8,16,17,19,20,22,23],hauh:[8,17],have:[0,1,2,16,17,19,20,21],header:0,hei3hau6:[8,17],hei6au6:[8,17],hei:[8,17],heihauh:[8,17],helper:19,heoi3:[16,20,21],here:[0,2,12,20,21,23],high:[17,21],hill:16,him:[16,18],hkcancor:[1,2,5,10,11,13,16,17,19,20,21,23],hkcancor_to_ud:[1,10,19],hku:2,ho2ji5:21,hoeng1gong2jan4:[5,16,17],hong:[1,2,6,16,18,20,22],hongkong:[5,16,17],hood:2,hou2:[17,21],hou7:17,how:[20,21],howev:23,hss:[10,11],html:[0,4,10,11],http:[0,1,2,4,10,11,12,16],hyperlink:20,ident:20,identifi:[0,20],ids:0,ignor:[0,4,21],illeg:[7,8,9],illustr:20,implement:[1,16,20,23],improv:1,includ:[0,1,4,13,16,17,20,21,22,23],inconveni:17,incorpor:[2,16],independ:[17,20],index:[0,10,11],indic:0,individu:[0,1,4,5,16,17],inform:[0,20,21],ingest:17,inherit:[0,3,20],initi:[0,1,3,4,15,21],innov:18,input:[1,10,11,13,19],instagram:16,instanc:[0,20,21,23],instanti:0,instead:[0,1,4,17],integ:23,intellig:17,intention:0,interest:[0,17,21],intern:[1,10],internet:0,interpret:16,introduc:[16,18,20],intuit:[],inv:0,invalid:17,investig:19,involv:[17,21],ipsyn:0,island:22,issu:[1,16],issubset:22,iter:[0,4,14,15,21,22,23],its:[1,2,8,10,16,17,20,21],jackson:[16,18],jacksonlle:16,jat6:21,jau5dak1:[16,21],jau5mou5:[16,21],jenni:16,jiu3:[16,21],jp_str:[7,8,9],json:23,just:[0,4,20,21],jyut6:17,jyutp:[1,2,3,4,5,7,8,9,16],jyutping2tipa:[1,7],jyutping2yal:[1,8],jyutping_s:[0,1,3],jyutping_to_tipa:[1,17],jyutping_to_x:[1,8],jyutping_to_yal:[1,17],jyutpingi1:[],keep:21,keep_cas:0,kept:0,keyword:[1,13,17,19,23],kind:21,known:[],kong:[1,2,6,16,18,20,22],kowloon:22,koy55:17,kwarg:23,laa1:21,laa4:21,lai:16,lam:16,languag:[2,3,11,16,17,19,20,22,23],last:[0,16,17],latex:[7,17],learn:[13,16,23],leav:0,lee:[2,16,18],leewongleung:2,left:[0,4,21],len:[0,2,14,16,21,22],length:[0,1,13,15,23],leo:2,leoi5hang4:[16,20,21],less:11,let:20,letter:[0,17,20,21],leung:2,level:[0,17,20,21],lib:17,librari:[2,13,16,17,23],licens:[1,2,17,19,23],like:[0,2,19,21,22,23],likewis:21,limit:19,line:[1,8,17,20],linguist:[2,11,18,19,21],link:1,list:[0,1,4,5,7,8,9,10,13,17,20,21],litong:[16,18],lo1:21,load:23,local:[0,2,12,17],longer:13,longest:[13,23],look:0,loop:21,low:[8,17],lowercas:0,luk2:21,m4goi1:17,m4hai6:21,machin:[2,21],mai6:21,mai:[0,3,17,21,22],maintein:16,major:[19,20],make:[0,17],mani:[0,2,20,22,23],manual:2,map:[1,11,19],march:[2,18],mark:5,marker:[8,17,20],match:[0,4,12,13,21,23],materi:20,matter:5,matthew:2,max_word_length:[15,23],maxim:13,maximum:[1,13,15,23],mean:[0,2,16,19,21],meaning:20,media:16,meet:2,memori:0,metadata:20,method:[0,1,3,15,16,21],might:[17,19],mit:16,mix:21,mlu:0,mlum:0,mluw:0,model:[1,2,5,10,13,15,17,19,23],modifi:16,modul:17,month:0,moon:[],mor:[0,16,20,21],more:[0,1,2,4,16,19,20,21,23],morphem:0,morpholog:[0,20],most:[0,16,17],most_common:20,mot:[0,20],mother:20,mou5dak1:[16,21],multipl:[7,8,9,17],n_file:[0,2],name:[1,16,20],nasal:[1,17],natur:[2,11,16,19,22,23],naturalist:2,necessari:[17,22,23],necessit:17,need:[2,17,21],nei5:[20,21],neighbor:21,neither:21,ngram:0,nltk:1,none:[0,4,5,11,12,13,14,15,16,17,20,21],nongra:[],nor:21,note:[1,16,18],noun:[10,19],now:[1,5],ntu:[10,11],nuclei:17,nucleu:[0,1,4,9,16,17,21],number:[0,2,16,17,20,21],number_of_charact:1,number_of_word:1,numer:17,numpydoc:1,object:[0,1,6,13,15,17,20,21,23],obtain:2,occurr:19,odd:19,off:19,offer:17,often:[22,23],ohio:18,oken:[],on25:[7,17],one:[0,1,2,4,7,8,9,10,12],ones:22,onli:[0,1,4,12,19,21],onset:[0,1,4,8,9,16,17,21],onward:1,open:1,option:[0,4,8,10,11,12,13,14,15,20,22],orb:1,order:[0,2,16,20],org:[0,1,2,4,10,11,12],organ:[0,4],origin:[1,2,10,11,16,19,20],orthograph:0,other:[0,1,2,8,17,21],otherwis:[0,4,17,21],out:2,output:[0,4,5,8,16,17,20],over:[10,11,19],overal:1,own:[2,20],packag:[0,1,17,20],page:20,paidocantones:2,paidologo:2,pair:[10,19],paramet:[0,1,4,5,7,8,9,10,11,12,13,14,15,21],parent:[0,3],pars:[0,1,2,9,12,16,20,21],parse_jyutp:[1,16,17],parser:1,part:[0,1,2,4,8,10,11,16,17,20],particip:[0,1,4,20],particl:21,particular:21,particularli:[2,17,20],pass:[0,12,23],path:[0,1,2,4,12],perceptron:[10,19],perform:21,perhap:19,permiss:[16,19],phonbank:2,phonolog:[0,4,17],phrase:[1,10,19],pick:1,piggyback:1,pin:1,pinjam:[17,21],pip:16,placehold:20,pleas:[0,2,4,16,17,19,20],plu:[0,1,4,20,21],point:[1,2,12,20],pop:0,pop_left:0,pos:[0,4,10,11,16,20,21],pos_tag:[1,19],possibl:[1,2,21],potenti:[17,19,23],power:21,pprint:[],preced:21,predict:10,preprocess:0,preserv:20,preval:[0,4],previou:[8,17],previous:[1,5],primer:2,print:[16,20,21],process:[11,16,17,19,20,22,23],product:0,pron:[10,19],pronoun:22,pronunci:17,properti:0,prove:17,provid:[0,2,11,12,13,17,19,21,22,23],ptk:21,ptk_tone2:21,publicli:2,punct:[10,19],punctuat:5,purpos:[1,11,21],pycantones:[0,1,2,17,18,19,20,21,22,23],pylangacq:[0,1,2,20],pypi:1,python3:17,python:[1,2,17,21],qualiti:[1,19],queri:[2,16],quot:[8,17],rachel:16,rais:[0,1,4,7,8,9,10,17],random:0,rang:[0,1,4],rather:1,ratio:0,read:[0,1,2,12,17],read_chat:2,readabl:2,reader:[0,1,3,16,17],readm:[2,16],readthedoc:1,reason:2,recent:[2,16,17],record:[0,20],recurs:[0,12],refer:16,regex:[0,4,21],regular:[0,12,16,21],rel:19,relat:[0,2],releas:[1,16,17,23],relev:17,remot:0,remov:[0,14,22],rendit:20,replac:[1,5,7,8],report:16,repr:17,repres:[0,1,5,17,21],represent:[0,17,20],request:16,requir:1,research:[2,16,21],resourc:[16,17],restructur:1,result:[0,1,16,17,19,23],retriev:17,revis:1,rich:2,richielo:16,right:[0,2,4,21],rime:[1,5,13,16,17,23],rime_cantones:16,rise:[17,21],robin:16,role:20,roman:[1,2,4,5,7,8,9,16,21],rst:1,ryan:16,rylanchiu:16,sai:20,same:[2,20,21,23],sarah:[0,12],satisfi:0,scheme:17,search:[0,1,2,16],second:21,see:[0,2,4,10,16,17,20,21],seem:17,segment:[0,1,2,4,5,10,16,17,19,20],semant:21,sens:0,sent:[0,1,4],sent_rang:[0,1,4],sentenc:[0,1,10,13,19],separ:17,septemb:[16,18],ses:20,session:20,set:[0,4,13,14,17,21,22],sever:[2,20],sex:20,shoe:[10,19],show:[16,21],similarli:22,simpl:23,simpli:[0,20],sinc:[0,2,12,19,20,21],singl:[2,8,12,21],situat:23,size:19,slide:[16,18,21],small:19,snippet:1,snowman:16,social:16,some:[0,3],some_token:20,sophist:19,sort:0,sourc:[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17],space:23,span:[0,4,21],speak:[5,16,17],speaker:20,specif:[1,19,20,21],specifi:[0,12,21,23],speech:[0,1,2,4,10,11,16,20],spot:20,stabl:16,standalon:17,standard:19,state:18,statist:19,statu:20,stdin:17,step:0,stephan:16,still:[17,19,23],stiller:16,stop:[1,14,16],stop_word:22,stop_words_1:[14,22],stop_words_2:[14,22],store:20,str:[0,4,5,7,8,9,10,11,12,13,14,15,17],straightforward:17,string:[0,1,4,5,8,10,12,13,20,21,22,23],strip:1,structur:[0,12,20],studi:[0,20,21],style:1,substanti:16,suffix:17,suit:1,suppli:1,support:[0,1,4,10],syllab:[1,17],syllabl:[8,17],syntax:[0,17,21],system:17,tag:[0,1,2,4,10,11,16,20],tagged_s:0,tagged_word:0,tagger:[10,19],tagset:[1,10,11,19,21],take:[1,2,17,19,21,22,23],talk:[16,18],talkbank:[0,12,20],tape:20,target:[0,20,21],task:[11,17,21,22,23],teach:17,teng1:21,term:[0,4,16],territori:22,test:[1,16],text:[0,1,4,12,20,21,23],than:[0,1,2,4,13,19,21,23],thank:[2,17],thei:[0,1,2],theme:1,thi:[0,1,2,5,7,8,10,11,12,13,15,16,17,20,21,23],think:[19,20,23],those:[0,2,17],though:23,three:20,through:[20,21],thrown:19,tier:20,time:[20,21],time_mark:20,tipa:[1,7],todai:17,togeth:0,token:[4,16,17,20,21],tone2:21,tone:[0,4,8,9,16,17,21],tool:[16,17,18,20],top:20,touch:19,traceback:17,track:21,tracker:16,tradit:[0,4],train:[10,13,19,23],transcrib:2,transcript:0,translat:20,transpar:1,travel:16,treat:[0,17,22],trigram:20,trivial:17,tsui:[16,18],tsz:[16,18],ttr:0,tun55:[7,17],tupl:[0,1,4,5,10,21],turn:[1,17],twitter:16,two:[17,20,21],txt:16,type:0,typeerror:10,typic:[0,20],ubiquit:20,unclear:8,under:[0,2,12],underli:[1,17,20,23],unicod:[0,12],union:0,univers:[1,10,11,16,18,19],universaldepend:[10,11],unrecogn:[7,8,9,11],unseen:[5,17],unseg:[1,10,13,19,23],unspecifi:[0,4],unzip:[0,2,12],updat:1,upgrad:16,url:[0,2,12],usag:20,use:[0,1,2,3,4,17,20,21,22],used:[0,2,4,10,13,17,20,21,23],useful:[2,21],usefulness:17,user:[1,17],uses:[2,10,11,19,20],using:[17,21],usr:17,usual:20,utf8:20,utf:[0,1,12],utter:[0,2,4,20],utterance_rang:[0,1,4,21],uuid:0,valid:17,valu:[0,1,5,8,12,17,21],valueerror:[0,4,7,8,9,10,17],varieti:21,variou:[1,17,20,21],verb:[10,11,16,19,21],version:[0,1,5,7,8,10,11,13,14,15,16,17,23],via:2,visual:20,wa25:[7,17],waa6:21,waan2:21,wai3:20,wai:[2,20,23],well:[1,2,20],whatev:21,when:[0,1,2,17,21],whenev:20,where:[0,1,5,10,20,21,23],whether:[17,23],which:[0,3,8,10,11,17,19,20,21,22],whitespac:1,who:16,whose:[0,16],wide:[2,21],window:[1,21],within:[0,4,21],without:2,wonder:16,wong:2,word:[0,1,2,4,5,10,13,14,15,16,17,19],word_freq:20,word_frequ:[0,20],word_ngram:[0,20],word_rang:[0,4,21],word_segment:[13,23],wordlist:1,wordseg:1,work:[0,2,12,19,20,21,23],workshop:18,worth:19,would:[0,2,8,11,17,19,23],wouldn:19,wrap:0,write:21,written:23,x2y:1,x_to_i:1,xml:2,xxa:20,xxb:20,xxx:[0,12],yale:[1,8],year:[0,2],yesterdai:[10,19],yip:2,yipmatthew:[2,12],you:[0,2,12,17,19,20,21,22,23],your:[0,2,3,16,19,21,22],yue:20,yuen:16,yut:17,zan1hai6:21,zero:21,zik6cing4:21,zip:[0,2,12],zoek2:21,zyu6:21},titles:["API Reference","Changelog","Corpus Data","pycantonese.CHATReader","pycantonese.CHATReader.search","pycantonese.characters_to_jyutping","pycantonese.hkcancor","pycantonese.jyutping_to_tipa","pycantonese.jyutping_to_yale","pycantonese.parse_jyutping","pycantonese.pos_tag","pycantonese.pos_tagging.hkcancor_to_ud","pycantonese.read_chat","pycantonese.segment","pycantonese.stop_words","pycantonese.word_segmentation.Segmenter","PyCantonese: Cantonese Linguistics and NLP in Python","Jyutping Romanization","Research Outputs","Part-of-Speech Tagging","Corpus Reader Methods","Corpus Search Queries","Stop Words","Word Segmentation"],titleterms:{"0dev":1,"2014":1,"2015":1,"2016":1,"2018":1,"2020":1,"2021":1,"break":1,Added:1,acknowledg:16,annot:20,api:[0,1],built:2,cantones:16,chang:1,changelog:1,charact:[17,20,21],characters_to_jyutp:5,chat:2,chatread:[0,3,4],child:2,chines:[20,21],cite:16,content:16,convers:17,corpu:[0,2,20,21],criteria:21,custom:[2,23],data:[0,2],deprec:1,download:16,element:21,exampl:16,fix:1,format:[2,21],frequenc:20,header:20,hkcancor:6,hkcancor_to_ud:11,how:16,instal:16,jyutp:[0,17,20,21],jyutping_to_tipa:7,jyutping_to_yal:8,languag:0,licens:16,linguist:16,link:16,logo:16,method:20,multipl:21,natur:0,ngram:20,nlp:16,non:1,output:[18,21],pars:17,parse_jyutp:9,part:[19,21],pos_tag:[10,11],process:0,pycantones:[3,4,5,6,7,8,9,10,11,12,13,14,15,16],python:16,queri:21,quick:16,rang:21,read_chat:12,reader:20,refer:0,remov:1,research:18,result:21,roman:[0,17,20],search:[4,21],secur:1,segment:[13,15,23],speech:[19,21],stop:22,stop_word:14,string:17,tabl:16,tag:[19,21],talkbank:2,tipa:17,token:0,transcript:20,unreleas:1,utter:21,word:[20,21,22,23],word_segment:15,yale:17}}) \ No newline at end of file +Search.setIndex({docnames:["api","changelog","data","generated/pycantonese.CHATReader","generated/pycantonese.CHATReader.search","generated/pycantonese.characters_to_jyutping","generated/pycantonese.hkcancor","generated/pycantonese.jyutping_to_tipa","generated/pycantonese.jyutping_to_yale","generated/pycantonese.parse_jyutping","generated/pycantonese.pos_tag","generated/pycantonese.pos_tagging.hkcancor_to_ud","generated/pycantonese.read_chat","generated/pycantonese.segment","generated/pycantonese.stop_words","generated/pycantonese.word_segmentation.Segmenter","index","jyutping","papers","pos_tagging","reader","searches","stop_words","word_segmentation"],envversion:{"sphinx.domains.c":2,"sphinx.domains.changeset":1,"sphinx.domains.citation":1,"sphinx.domains.cpp":3,"sphinx.domains.index":1,"sphinx.domains.javascript":2,"sphinx.domains.math":2,"sphinx.domains.python":2,"sphinx.domains.rst":2,"sphinx.domains.std":1,"sphinx.ext.intersphinx":1,"sphinx.ext.viewcode":1,sphinx:56},filenames:["api.rst","changelog.rst","data.rst","generated/pycantonese.CHATReader.rst","generated/pycantonese.CHATReader.search.rst","generated/pycantonese.characters_to_jyutping.rst","generated/pycantonese.hkcancor.rst","generated/pycantonese.jyutping_to_tipa.rst","generated/pycantonese.jyutping_to_yale.rst","generated/pycantonese.parse_jyutping.rst","generated/pycantonese.pos_tag.rst","generated/pycantonese.pos_tagging.hkcancor_to_ud.rst","generated/pycantonese.read_chat.rst","generated/pycantonese.segment.rst","generated/pycantonese.stop_words.rst","generated/pycantonese.word_segmentation.Segmenter.rst","index.rst","jyutping.rst","papers.rst","pos_tagging.rst","reader.rst","searches.rst","stop_words.rst","word_segmentation.rst"],objects:{"pycantonese.CHATReader":{__init__:[3,1,1,""],ages:[0,1,1,""],append:[0,1,1,""],append_left:[0,1,1,""],characters:[0,1,1,""],clear:[0,1,1,""],dates_of_recording:[0,1,1,""],extend:[0,1,1,""],extend_left:[0,1,1,""],file_paths:[0,1,1,""],from_dir:[0,1,1,""],from_files:[0,1,1,""],from_strs:[0,1,1,""],from_zip:[0,1,1,""],headers:[0,1,1,""],ipsyn:[0,1,1,""],jyutping:[0,1,1,""],languages:[0,1,1,""],mlu:[0,1,1,""],mlum:[0,1,1,""],mluw:[0,1,1,""],n_files:[0,1,1,""],participants:[0,1,1,""],pop:[0,1,1,""],pop_left:[0,1,1,""],search:[4,1,1,""],sents:[0,1,1,""],tagged_sents:[0,1,1,""],tagged_words:[0,1,1,""],tokens:[0,1,1,""],ttr:[0,1,1,""],utterances:[0,1,1,""],word_frequencies:[0,1,1,""],word_ngrams:[0,1,1,""],words:[0,1,1,""]},"pycantonese.corpus":{Token:[0,0,1,""]},"pycantonese.jyutping":{Jyutping:[0,0,1,""]},"pycantonese.jyutping.Jyutping":{"final":[0,1,1,""],__str__:[0,1,1,""]},"pycantonese.pos_tagging":{hkcancor_to_ud:[11,2,1,""]},"pycantonese.word_segmentation":{Segmenter:[15,0,1,""]},"pycantonese.word_segmentation.Segmenter":{__init__:[15,1,1,""]},pycantonese:{CHATReader:[3,0,1,""],characters_to_jyutping:[5,2,1,""],hkcancor:[6,2,1,""],jyutping_to_tipa:[7,2,1,""],jyutping_to_yale:[8,2,1,""],parse_jyutping:[9,2,1,""],pos_tag:[10,2,1,""],read_chat:[12,2,1,""],segment:[13,2,1,""],stop_words:[14,2,1,""]}},objnames:{"0":["py","class","Python class"],"1":["py","method","Python method"],"2":["py","function","Python function"]},objtypes:{"0":"py:class","1":"py:method","2":"py:function"},terms:{"000":19,"001":20,"001_v2":20,"100":[10,11,19,22],"104":[1,14,22],"105":[14,22],"107":22,"1177307":2,"127":[],"12715":21,"13251":20,"134":20,"140":20,"150":19,"153654":2,"160":2,"161":2,"167":20,"16730":2,"1681":21,"178270":2,"186":20,"190":2,"1949480":2,"195":21,"197":17,"1997":20,"1st":21,"2015":[16,18],"2016":18,"202":20,"2020":[16,17,23],"2021":2,"209":20,"21167":21,"219":20,"22328":21,"223415":2,"2259":20,"2570":20,"2734":20,"2741":20,"2755":20,"29012":[],"2911":20,"29726":[16,21],"29954":21,"30th":20,"3rd":18,"4110":20,"501":2,"5019":20,"520":20,"527":20,"533877":2,"70438":2,"705":21,"9282":20,"\u4e00\u5572":[14,22],"\u4e00\u5b9a":[14,22],"\u4e03":21,"\u4e0d\u5982":[14,22],"\u4e0d\u904e":[14,21,22],"\u4e5d\u9f8d":22,"\u4f4f":21,"\u4f60":[20,21],"\u4f62":20,"\u4fc2":[16,20,21],"\u505c\u7528\u8a5e":16,"\u505c\u7528\u8bcd":16,"\u516b\u6708":21,"\u5187\u5f97":[16,21],"\u5206\u8a5e":16,"\u5206\u8bcd":16,"\u53bb":[16,20,21],"\u53ef\u4ee5":21,"\u5416":20,"\u5462":20,"\u54aa":21,"\u5514":[13,20,23],"\u5514\u4fc2":21,"\u5514\u8a72":17,"\u554a":[20,21],"\u5572":20,"\u5582":20,"\u5587":[20,21],"\u558e":20,"\u55ce":20,"\u55f0":[10,19],"\u55f0\u500b":21,"\u55f0\u908a":21,"\u55f1":21,"\u5649":20,"\u565a\u65e5":[10,19],"\u56d6":[20,21],"\u597d":[16,17,21],"\u5b78":[13,16,23],"\u5bb9":[13,23],"\u5bb9\u5514\u5bb9\u6613":[13,23],"\u5bb9\u6613":[13,23],"\u5c0d":[10,19],"\u5c31":20,"\u5e7e":21,"\u5e7f\u4e1c\u8bdd":16,"\u5ee3\u6771":23,"\u5ee3\u6771\u8a71":[5,7,8,9,13,16,17,20,23],"\u5ee3\u6771\u8a71\u597d\u96e3\u5b78":16,"\u5ee3\u6771\u8a71\u5bb9\u5514\u5bb9\u6613\u5b78":[13,23],"\u6211":[10,19,20],"\u6211\u565a\u65e5\u8cb7\u55f0\u5c0d\u978b":19,"\u62b5":21,"\u65b0\u754c":22,"\u65c5":20,"\u65c5\u884c":[16,20,21],"\u65e5":21,"\u6709\u5187":[16,21],"\u6709\u5f97":[16,21],"\u6a5f":21,"\u6a5f\u7968":21,"\u6c23\u5019":[8,17],"\u6de1\u5b63":21,"\u73a9":21,"\u76f4\u7a0b":21,"\u771f\u4fc2":21,"\u7793\u89ba":21,"\u789f":21,"\u789f\u5f62":21,"\u7ca4\u62fc":16,"\u7ca4\u8bed":16,"\u7cb5":16,"\u7cb5\u62fc":16,"\u7cb5\u8a9e":16,"\u7da0":21,"\u8072\u6bcd":21,"\u807d":21,"\u81ea\u7136\u8a9e\u8a00\u8655\u7406":16,"\u81ea\u7136\u8bed\u8a00\u5904\u7406":16,"\u86cb":17,"\u86cb\u7cd5":17,"\u884c":20,"\u8981":[16,21],"\u8a5e\u6027\u6a19\u6ce8":16,"\u8a71":[21,23],"\u8a92":20,"\u8a9e\u8a00\u5b78":16,"\u8b1b":[5,16,17,21],"\u8b8a\u97f3":17,"\u8bcd\u6027\u6807\u6ce8":16,"\u8bed\u8a00\u5b66":16,"\u8cb7":[10,19],"\u8cca":21,"\u8ddf":21,"\u8fea\u58eb\u5c3c":21,"\u904e":21,"\u9072":20,"\u90fd":21,"\u96c0":21,"\u96e3":16,"\u978b":[10,19],"\u97fb\u6bcd":[17,21],"\u98db\u6a5f":21,"\u9999\u6e2f":[14,22],"\u9999\u6e2f\u4eba":[5,16,17],"\u9999\u6e2f\u4eba\u8b1b\u5ee3\u6771\u8a71":[5,16,17],"\u9999\u6e2f\u5cf6":22,"\ud842\udfa9\ud843\ude4c":1,"\ud842\udfa9\ud843\ude4c":[],"\ud843\udd15":1,"\ud843\udd15":[],"\ud843\ude9d":1,"\ud843\ude9d":[],"\ud843\udea2":[1,20],"\ud843\udea2":[],"\ud843\uded7":1,"\ud843\uded7":[],"\ud844\udc14":1,"\ud844\udc14":[],"\ud844\udc5c":1,"\ud844\udc5c":[],"\ud844\udcc9":[20,21],"\ud844\udcc9":[],"\ud844\udcd3":1,"\ud844\udcd3":[],"\ud854\ude99":1,"\ud854\ude99":[],"\ud85d\udd74":1,"\ud85d\udd74":[],"case":[0,3,17,21,22],"char":5,"class":[0,1,3,13,15,17,20,23],"computational linguist":16,"d\u016bng":[8,17],"default":[0,1,4,8,10,12,13,19,21,22,23],"final":[0,1,4,17,21],"float":0,"function":[1,2,5,7,8,11,17,19,20,21,22,23],"g\u014di":17,"gw\u00f3ng":[8,17],"gw\u00f3ngd\u016bngw\u00e1":[8,17],"import":[2,13,16,17,19,20,21,22,23],"int":[0,4,15,21],"natural language process":16,"new":[0,1,5,7,8,10,11,13,14,15,22],"null":1,"part-of-speech tag":16,"return":[0,1,4,5,6,7,8,9,10,11,12,13,14,17,20,21,22,23],"stop word":16,"super":[7,17],"switch":[1,8],"true":[0,1,4,8,14,20,21,22],"while":[17,20,21],"word segment":16,Added:13,Eve:[0,12],For:[0,2,4,11,12,20,21],Its:19,One:[0,4],POS:10,Such:0,The:[0,1,2,4,5,10,12,13,16,17,18,19,20,21,22,23],There:10,Used:1,With:[16,21],__init__:[3,15],__str__:0,__version__:16,aa3:[20,21],abil:17,abl:[2,17],about:[19,22],abov:[1,21],accept:19,access:[1,16,17,20,21],accommod:20,acquisit:[0,2,3],adam:[0,12],add:[14,17,22],addit:20,adjust:21,adopt:[1,2],adult:20,adv:[10,19],after:[0,4,20],age:[0,16,18,20],ages:[0,20],albino:16,algorithm:23,all:[0,1,2,4,16,20,21,22],all_verb:[16,21],allow:[1,10,13,15,21,23],allow_remot:0,alon:21,alphabet:[2,16],alreadi:[0,19,21],also:[17,20,21,22],alwai:21,ambigu:[8,17],american:[0,12],among:17,analysi:21,ani:[0,5,11,17,21],annot:[1,2,11,16,19,21],anonym:20,anoth:[0,17],anyth:2,api:16,append:0,append_left:0,appli:[0,1,10,12],applic:[0,3,11],approach:[13,19],appropri:[0,20],apr:20,april:20,arbitrari:0,argument:[1,10,13,17,19,20,22,23],aris:17,around:[20,21],artist:16,as_list:[1,8,17],ask:21,associ:[2,20],attempt:0,attribut:[0,17,20],audio:20,augment:20,author:16,automat:[17,21],avail:[1,2,20],averag:[10,19],baat3jyut6:21,back:1,ban:23,base:[5,17,21],basic:[1,19],bat1gwo3:21,bear:21,becaus:[21,23],becom:0,been:[0,1,2,16,19,20],befor:[0,1,4],begin:[0,12,16,20,21],behavior:[13,17],being:8,below:[1,20],benefit:11,better:1,between:[8,17,20],beyond:2,big:[16,18],bile:[2,12],bilingu:[2,16,18],bool:[0,4,8],both:[0,1,2,20],bought:[10,19],boundari:[8,23],brown:[0,12],bug:[1,16],build:1,built:[1,20,21],bump:1,by_fil:[0,4,20],by_token:[0,1,4,21],by_utter:[0,1,4,20,21],caak2:21,cake:17,call:[17,20,21],can:[0,2,11,12,17,20,21,22,23],cannot:0,cantones:[0,1,2,3,4,5,6,7,8,9,10,12,13,14,17,18,19,20,21,22,23],cap:1,capabl:[20,21],capit:0,cat1:21,cathug:16,centr:[16,18],certain:23,cha:[0,2,12,20],chang:[5,8,13,17,21],changelog:16,chao:17,charact:[0,1,4,5,7,8,9,10,13,16,23],character_s:[0,1,3],characters2jyutp:[1,5],characters_to_jyutp:[1,16,17],charl:16,chat:[0,1,3,12,20],chatread:[1,2,6,12,20],chcc:2,check:2,chen:[16,18],chi:[0,20],child:[0,12,20],childhood:[16,18],children:[0,12],chim:16,chines:[0,2,4,16,17,18],ci4:20,circleci:1,classmethod:[0,2],clear:0,climat:[8,17],cls:[13,23],coda:[0,1,4,9,16,17,21],codas_ptk:21,code:[1,2,9,16,20,21],collaps:0,collect:[0,20],colloc:21,com:16,combin:[0,19,21],come:[0,1,2,16,19],common:[17,21],commonli:17,compl:[10,11],complet:21,compon:17,comput:18,concurr:23,confus:17,conson:[8,17],constrain:23,constraint:21,consult:2,contact:16,contain:[0,2,4,12,13,17,21,23],contextu:17,contrast:20,contribut:16,control:[2,20,23],conveni:[2,20],convent:[0,1,20,23],convers:[1,2,5,16,21],convert:[1,2,5,7,8,10,17],corpora:2,corpu:[1,3,5,6,12,13,16,17,19,23],correct:17,correspond:[0,20],count:20,counter:[0,20],counterpart:[1,2],cover:1,creat:[6,20],criteria:[0,4],criterion:[1,2],cross:[11,19],current:[0,2,16,17,19,20,23],custom:[1,13,20,21],customiz:[1,15],daam6gwai3:21,daan2:17,daan6gou1:17,dai2:21,dai:0,data:[1,4,5,10,11,12,13,16,17,18,19,20,21,22,23],dataset:[0,1,2,12,16,20],date:[0,20],dates_of_record:0,datetim:[0,20],deal:20,dedic:20,defin:[1,20],demograph:20,depend:[1,10,11,17,19,22,23],deprec:[0,4,5,7,8],describ:[0,10,11,19,21],design:[2,16,21],detail:[1,16,20],detect:17,determin:[0,22],develop:18,di1:20,dict:[0,11,20],dictioari:11,dictionari:[11,17],differ:[20,21],difficult:16,dik6si6nei4:21,dip2:21,dip2jing4:21,direct:[0,1],directli:17,directori:[0,2,12],disabl:[0,1,4],disallow:[1,13,15,23],disambigu:[8,17],discours:21,dist:17,distinct:0,distribut:21,doc:1,docstr:1,document:[1,2,20],doe:[0,12,13],domain:1,domin:0,done:0,dou1:21,download:[0,2],drive:2,drop:[0,1],due:[17,19],duplic:0,dut2:21,each:[0,1,5,10,21],easi:[13,17,23],edu:[10,11],educ:20,egg:17,either:[0,12,22],element:[0,1,7,8,9,20],email:16,empti:3,encod:[0,1,12],end:[17,23],eng:[0,20],english:[0,12,20,22,23],enough:17,entir:[11,23],entri:1,equival:[0,1,4,5,7,8,17,21],error:[1,17],especi:19,etc:[16,20,21],european:0,even:0,exampl:[0,4,5,7,8,9,10,11,12,13,14,17,20,21,22],exclud:[0,1,4,12,20],exist:[0,23],expect:19,explicitli:23,expos:[1,17,19],express:[0,12,16,21],extend:0,extend_left:0,extens:0,facebook:16,fact:19,fals:[0,1,4,8,14,17,21],fan3gaau3:21,fat:20,father:20,favor:1,featur:16,feedback:16,fei1gei1:21,femal:20,file:[0,1,2,3,4,12,17,20],file_path:0,filenam:20,filter:[0,12,20,22],find:[16,21],fine:2,first:[0,20,21],five:[13,23],flavor:20,flexibl:21,folder:1,follow:[0,1,2,12,21,23],forc:[0,1],form:[0,1,4,23],format:[1,16,20],found:[20,21,23],frequenc:0,from:[0,1,2,3,8,10,11,12,13,16,17,19,20,21,22,23],from_dir:[0,2],from_fil:[0,2],from_str:[0,2],from_zip:[0,2],full:21,further:[17,19,20],futur:17,gaa3:21,gan1:21,gei1:21,gei1piu3:21,gei2:21,gender:20,gener:[0,1,2,4,21],get:[16,17,19,20],github:[1,16],give:21,given:[0,1,2,4,11,20,21],go2bin1:21,go2go3:21,gong2:[5,16,17,21],good:17,gra:[0,16,20,21],grab:21,grain:2,grammat:0,granular:11,group:20,guthri:2,gwo3:21,gwong2dung1waa2:[5,7,8,9,16,17],hai6:[16,21],han:16,handl:[0,2,12,16,17,20],handout:18,has:[1,2,8,16,17,19,20,22,23],hauh:[8,17],have:[0,1,2,16,17,19,20,21],header:0,hei3hau6:[8,17],hei6au6:[8,17],hei:[8,17],heihauh:[8,17],helper:19,heoi3:[16,20,21],here:[0,2,12,20,21,23],heritag:2,high:[17,21],hill:16,him:[16,18],hkcancor:[1,2,5,10,11,13,16,17,19,20,21,23],hkcancor_to_ud:[1,10,19],hku:2,ho2ji5:21,hoeng1gong2jan4:[5,16,17],hong:[1,2,6,16,18,20,22],hongkong:[5,16,17],hood:2,hou2:[17,21],hou7:17,how:[20,21],howev:23,hss:[10,11],html:[0,4,10,11],http:[0,1,2,4,10,11,12,16],hyperlink:20,ident:20,identifi:[0,20],ids:0,ignor:[0,4,21],illeg:[7,8,9],illustr:20,implement:[1,16,20,23],improv:1,includ:[0,1,4,13,16,17,20,21,22,23],inconveni:17,incorpor:[2,16],independ:[17,20],index:[0,10,11],indic:0,individu:[0,1,4,5,16,17],inform:[0,20,21],ingest:17,inherit:[0,3,20],initi:[0,1,3,4,15,21],innov:18,input:[1,10,11,13,19],instagram:16,instanc:[0,20,21,23],instanti:0,instead:[0,1,4,17],integ:23,intellig:17,intention:0,interest:[0,17,21],intern:[1,10],internet:0,interpret:16,introduc:[16,18,20],intuit:[],inv:0,invalid:17,investig:19,involv:[17,21],ipsyn:0,island:22,issu:[1,16],issubset:22,iter:[0,4,14,15,21,22,23],its:[1,2,8,10,16,17,20,21],jackson:[16,18],jacksonlle:16,jat6:21,jau5dak1:[16,21],jau5mou5:[16,21],jenni:16,jiu3:[16,21],jp_str:[7,8,9],json:23,just:[0,4,20,21],jyut6:17,jyutp:[1,2,3,4,5,7,8,9,16],jyutping2tipa:[1,7],jyutping2yal:[1,8],jyutping_s:[0,1,3],jyutping_to_tipa:[1,17],jyutping_to_x:[1,8],jyutping_to_yal:[1,17],jyutpingi1:[],keep:21,keep_cas:0,kept:0,keyword:[1,13,17,19,23],kind:21,known:[],kong:[1,2,6,16,18,20,22],kowloon:22,koy55:17,kwarg:23,laa1:21,laa4:21,lai:16,lam:16,languag:[2,3,11,16,17,19,20,22,23],last:[0,16,17],latex:[7,17],learn:[13,16,23],leav:0,lee:[2,16,18],leewongleung:2,left:[0,4,21],len:[0,2,14,16,21,22],length:[0,1,13,15,23],leo:2,leoi5hang4:[16,20,21],less:11,let:20,letter:[0,17,20,21],leung:2,level:[0,17,20,21],lib:17,librari:[2,13,16,17,23],licens:[1,2,17,19,23],like:[0,2,19,21,22,23],likewis:21,limit:19,line:[1,8,17,20],linguist:[2,11,18,19,21],link:1,list:[0,1,4,5,7,8,9,10,13,17,20,21],litong:[16,18],lo1:21,load:23,local:[0,2,12,17],longer:13,longest:[13,23],look:0,loop:21,low:[8,17],lowercas:0,luk2:21,m4goi1:17,m4hai6:21,machin:[2,21],mai6:21,mai:[0,3,17,21,22],maintein:16,major:[19,20],make:[0,17],mani:[0,2,20,22,23],manual:2,map:[1,11,19],march:[2,18],mark:5,marker:[8,17,20],match:[0,4,12,13,21,23],materi:20,matter:5,matthew:2,max_word_length:[15,23],maxim:13,maximum:[1,13,15,23],mean:[0,2,16,19,21],meaning:20,media:16,meet:2,memori:0,metadata:20,method:[0,1,3,15,16,21],might:[17,19],mit:16,mix:21,mlu:0,mlum:0,mluw:0,model:[1,2,5,10,13,15,17,19,23],modifi:16,modul:17,month:0,moon:[],mor:[0,16,20,21],more:[0,1,2,4,16,19,20,21,23],morphem:0,morpholog:[0,20],most:[0,16,17],most_common:20,mot:[0,20],mother:20,mou5dak1:[16,21],multipl:[7,8,9,17],n_file:[0,2],name:[1,16,20],nasal:[1,17],natur:[2,11,16,19,22,23],naturalist:2,necessari:[17,22,23],necessit:17,need:[2,17,21],nei5:[20,21],neighbor:21,neither:21,ngram:0,nltk:1,none:[0,4,5,11,12,13,14,15,16,17,20,21],nongra:[],nor:21,note:[1,16,18],noun:[10,19],now:[1,5],ntu:[10,11],nuclei:17,nucleu:[0,1,4,9,16,17,21],number:[0,2,16,17,20,21],number_of_charact:1,number_of_word:1,numer:17,numpydoc:1,object:[0,1,6,13,15,17,20,21,23],obtain:2,occurr:19,odd:19,off:19,offer:17,often:[22,23],ohio:18,oken:[],on25:[7,17],one:[0,1,2,4,7,8,9,10,12],ones:22,onli:[0,1,4,12,19,21],onset:[0,1,4,8,9,16,17,21],onward:1,open:1,option:[0,4,8,10,11,12,13,14,15,20,22],orb:1,order:[0,2,16,20],org:[0,1,2,4,10,11,12],organ:[0,4],origin:[1,2,10,11,16,19,20],orthograph:0,other:[0,1,2,8,17,21],otherwis:[0,4,17,21],out:2,output:[0,4,5,8,16,17,20],over:[10,11,19],overal:1,own:[2,20],packag:[0,1,17,20],page:20,paidocantones:2,paidologo:2,pair:[10,19],paramet:[0,1,4,5,7,8,9,10,11,12,13,14,15,21],parent:[0,3],pars:[0,1,2,9,12,16,20,21],parse_jyutp:[1,16,17],parser:1,part:[0,1,2,4,8,10,11,16,17,20],particip:[0,1,4,20],particl:21,particular:21,particularli:[2,17,20],pass:[0,12,23],path:[0,1,2,4,12],perceptron:[10,19],perform:21,perhap:19,permiss:[16,19],phonbank:2,phonolog:[0,4,17],phrase:[1,10,19],pick:1,piggyback:1,pin:1,pinjam:[17,21],pip:16,placehold:20,pleas:[0,2,4,16,17,19,20],plu:[0,1,4,20,21],point:[1,2,12,20],pop:0,pop_left:0,pos:[0,4,10,11,16,20,21],pos_tag:[1,19],possibl:[1,2,21],potenti:[17,19,23],power:21,pprint:[],preced:21,predict:10,preprocess:0,preserv:20,preval:[0,4],previou:[8,17],previous:[1,5],primer:2,print:[16,20,21],process:[11,16,17,19,20,22,23],product:0,pron:[10,19],pronoun:22,pronunci:17,properti:0,prove:17,provid:[0,2,11,12,13,17,19,21,22,23],ptk:21,ptk_tone2:21,publicli:2,punct:[10,19],punctuat:5,purpos:[1,11,21],pycantones:[0,1,2,17,18,19,20,21,22,23],pylangacq:[0,1,2,20],pypi:1,python3:17,python:[1,2,17,21],qualiti:[1,19],queri:[2,16],quot:[8,17],rachel:16,rais:[0,1,4,7,8,9,10,17],random:0,rang:[0,1,4],rather:1,ratio:0,read:[0,1,2,12,17],read_chat:2,readabl:2,reader:[0,1,3,16,17],readm:[2,16],readthedoc:1,reason:2,recent:[2,16,17],record:[0,20],recurs:[0,12],refer:16,regex:[0,4,21],regular:[0,12,16,21],rel:19,relat:[0,2],releas:[1,16,17,23],relev:17,remot:0,remov:[0,14,22],rendit:20,replac:[1,5,7,8],report:16,repr:17,repres:[0,1,5,17,21],represent:[0,17,20],request:16,requir:1,research:[2,16,21],resourc:[16,17],restructur:1,result:[0,1,16,17,19,23],retriev:17,revis:1,rich:2,richielo:16,right:[0,2,4,21],rime:[1,5,13,16,17,23],rime_cantones:16,rise:[17,21],robin:16,role:20,roman:[1,2,4,5,7,8,9,16,21],rst:1,ryan:16,rylanchiu:16,sai:20,same:[2,20,21,23],sarah:[0,12],satisfi:0,scheme:17,search:[0,1,2,16],second:21,see:[0,2,4,10,16,17,20,21],seem:17,segment:[0,1,2,4,5,10,16,17,19,20],semant:21,sens:0,sent:[0,1,4],sent_rang:[0,1,4],sentenc:[0,1,10,13,19],separ:17,septemb:[16,18],ses:20,session:20,set:[0,4,13,14,17,21,22],sever:[2,20],sex:20,shoe:[10,19],show:[16,21],similarli:22,simpl:23,simpli:[0,20],sinc:[0,2,12,19,20,21],singl:[2,8,12,21],situat:23,size:19,slide:[16,18,21],small:19,snippet:1,snowman:16,social:16,some:[0,3],some_token:20,sophist:19,sort:0,sourc:[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17],space:23,span:[0,4,21],speak:[5,16,17],speaker:20,specif:[1,19,20,21],specifi:[0,12,21,23],speech:[0,1,2,4,10,11,16,20],spot:20,stabl:16,standalon:17,standard:19,state:18,statist:19,statu:20,stdin:17,step:0,stephan:16,still:[17,19,23],stiller:16,stop:[1,14,16],stop_word:22,stop_words_1:[14,22],stop_words_2:[14,22],store:20,str:[0,4,5,7,8,9,10,11,12,13,14,15,17],straightforward:17,string:[0,1,4,5,8,10,12,13,20,21,22,23],strip:1,structur:[0,12,20],studi:[0,20,21],style:1,substanti:16,suffix:17,suit:1,suppli:1,support:[0,1,4,10],syllab:[1,17],syllabl:[8,17],syntax:[0,17,21],system:17,tag:[0,1,2,4,10,11,16,20],tagged_s:0,tagged_word:0,tagger:[10,19],tagset:[1,10,11,19,21],take:[1,2,17,19,21,22,23],talk:[16,18],talkbank:[0,12,20],tape:20,target:[0,20,21],task:[11,17,21,22,23],teach:17,teng1:21,term:[0,4,16],territori:22,test:[1,16],text:[0,1,4,12,20,21,23],than:[0,1,2,4,13,19,21,23],thank:[2,17],thei:[0,1,2],theme:1,thi:[0,1,2,5,7,8,10,11,12,13,15,16,17,20,21,23],think:[19,20,23],those:[0,2,17],though:23,three:20,through:[20,21],thrown:19,tier:20,time:[20,21],time_mark:20,tipa:[1,7],todai:17,togeth:0,token:[4,16,17,20,21],tone2:21,tone:[0,4,8,9,16,17,21],tool:[16,17,18,20],top:20,touch:19,traceback:17,track:21,tracker:16,tradit:[0,4],train:[10,13,19,23],transcrib:2,transcript:0,translat:20,transpar:1,travel:16,treat:[0,17,22],trigram:20,trivial:17,tsui:[16,18],tsz:[16,18],ttr:0,tun55:[7,17],tupl:[0,1,4,5,10,21],turn:[1,17],twitter:16,two:[17,20,21],txt:16,type:0,typeerror:10,typic:[0,20],ubiquit:20,unclear:8,under:[0,2,12],underli:[1,17,20,23],unicod:[0,12],union:0,univers:[1,10,11,16,18,19],universaldepend:[10,11],unrecogn:[7,8,9,11],unseen:[5,17],unseg:[1,10,13,19,23],unspecifi:[0,4],unzip:[0,2,12],updat:1,upgrad:16,url:[0,2,12],usag:20,use:[0,1,2,3,4,17,20,21,22],used:[0,2,4,10,13,17,20,21,23],useful:[2,21],usefulness:17,user:[1,17],uses:[2,10,11,19,20],using:[17,21],usr:17,usual:20,utf8:20,utf:[0,1,12],utter:[0,2,4,20],utterance_rang:[0,1,4,21],uuid:0,valid:17,valu:[0,1,5,8,12,17,21],valueerror:[0,4,7,8,9,10,17],varieti:21,variou:[1,17,20,21],verb:[10,11,16,19,21],version:[0,1,5,7,8,10,11,13,14,15,16,17,23],via:2,visual:20,wa25:[7,17],waa6:21,waan2:21,wai3:20,wai:[2,20,23],well:[1,2,20],whatev:21,when:[0,1,2,17,21],whenev:20,where:[0,1,5,10,20,21,23],whether:[17,23],which:[0,3,8,10,11,17,19,20,21,22],whitespac:1,who:16,whose:[0,16],wide:[2,21],window:[1,21],within:[0,4,21],without:2,wonder:16,wong:2,word:[0,1,2,4,5,10,13,14,15,16,17,19],word_freq:20,word_frequ:[0,20],word_ngram:[0,20],word_rang:[0,4,21],word_segment:[13,23],wordlist:1,wordseg:1,work:[0,2,12,19,20,21,23],workshop:18,worth:19,would:[0,2,8,11,17,19,23],wouldn:19,wrap:0,write:21,written:23,x2y:1,x_to_i:1,xml:2,xxa:20,xxb:20,xxx:[0,12],yale:[1,8],year:[0,2],yesterdai:[10,19],yip:2,yipmatthew:[2,12],you:[0,2,12,17,19,20,21,22,23],your:[0,2,3,16,19,21,22],yue:20,yuen:16,yut:17,zan1hai6:21,zero:21,zik6cing4:21,zip:[0,2,12],zoek2:21,zyu6:21},titles:["API Reference","Changelog","Corpus Data","pycantonese.CHATReader","pycantonese.CHATReader.search","pycantonese.characters_to_jyutping","pycantonese.hkcancor","pycantonese.jyutping_to_tipa","pycantonese.jyutping_to_yale","pycantonese.parse_jyutping","pycantonese.pos_tag","pycantonese.pos_tagging.hkcancor_to_ud","pycantonese.read_chat","pycantonese.segment","pycantonese.stop_words","pycantonese.word_segmentation.Segmenter","PyCantonese: Cantonese Linguistics and NLP in Python","Jyutping Romanization","Research Outputs","Part-of-Speech Tagging","Corpus Reader Methods","Corpus Search Queries","Stop Words","Word Segmentation"],titleterms:{"0dev":1,"2014":1,"2015":1,"2016":1,"2018":1,"2020":1,"2021":1,"break":1,Added:1,acknowledg:16,annot:20,api:[0,1],built:2,cantones:16,chang:1,changelog:1,charact:[17,20,21],characters_to_jyutp:5,chat:2,chatread:[0,3,4],child:2,chines:[20,21],cite:16,content:16,convers:17,corpu:[0,2,20,21],criteria:21,custom:[2,23],data:[0,2],deprec:1,download:16,element:21,exampl:16,fix:1,format:[2,21],frequenc:20,header:20,hkcancor:6,hkcancor_to_ud:11,how:16,instal:16,jyutp:[0,17,20,21],jyutping_to_tipa:7,jyutping_to_yal:8,languag:0,licens:16,linguist:16,link:16,logo:16,method:20,multipl:21,natur:0,ngram:20,nlp:16,non:1,output:[18,21],pars:17,parse_jyutp:9,part:[19,21],pos_tag:[10,11],process:0,pycantones:[3,4,5,6,7,8,9,10,11,12,13,14,15,16],python:16,queri:21,quick:16,rang:21,read_chat:12,reader:20,refer:0,remov:1,research:18,result:21,roman:[0,17,20],search:[4,21],secur:1,segment:[13,15,23],speech:[19,21],stop:22,stop_word:14,string:17,tabl:16,tag:[19,21],talkbank:2,tipa:17,token:0,transcript:20,unreleas:1,utter:21,word:[20,21,22,23],word_segment:15,yale:17}}) \ No newline at end of file diff --git a/docs/source/data.rst b/docs/source/data.rst index a180e5d..b22ce79 100644 --- a/docs/source/data.rst +++ b/docs/source/data.rst @@ -65,6 +65,34 @@ the CC BY-NC-SA 3.0 license. As of March 2021, the following Cantonese-related datasets are available from CHILDES and TalkBank (in alphabetical order): +.. invisible-code-block: python + + >>> import os + +.. skip: start if(os.getenv("CI") == "true", reason="certain CHILDES data pulls fail in some but not all python versions for unknown reasons") + +* `Child Heritage Chinese Corpus `_ + + .. code-block:: python + + >>> url = "https://childes.talkbank.org/data/Biling/CHCC.zip" + >>> corpus = pycantonese.read_chat(url) + >>> corpus.n_files() + 190 + >>> len(corpus.words()) + 533877 + +* `Guthrie Bilingual Corpus `_ + + .. code-block:: python + + >>> url = "https://childes.talkbank.org/data/Biling/Guthrie.zip" + >>> corpus = pycantonese.read_chat(url) + >>> corpus.n_files() + 36 + >>> len(corpus.words()) + 70438 + * `HKU-70 Corpus `_ .. code-block:: python @@ -76,12 +104,6 @@ available from CHILDES and TalkBank (in alphabetical order): >>> len(corpus.words()) 178270 -.. invisible-code-block: python - - >>> import os - -.. skip: start if(os.getenv("CI") == "true", reason="certain CHILDES data pulls fail in some but not all python versions for unknown reasons") - * `Lee-Wong-Leung Corpus `_ .. code-block:: python diff --git a/docs/stop_words.html b/docs/stop_words.html index dacf1f3..3c31210 100644 --- a/docs/stop_words.html +++ b/docs/stop_words.html @@ -356,7 +356,7 @@

    - © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 21, 2021 + © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 23, 2021

    diff --git a/docs/word_segmentation.html b/docs/word_segmentation.html index 1cd4bbf..2d33905 100644 --- a/docs/word_segmentation.html +++ b/docs/word_segmentation.html @@ -393,7 +393,7 @@

    Customizing Segmentation

    - © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 21, 2021 + © Copyright 2014-2021, Jackson L. Lee | Documentation last updated on March 23, 2021

    diff --git a/setup.py b/setup.py index 2877dc6..cf4e385 100644 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ def main(): setup_requires="setuptools>=39", install_requires=[ "dataclasses ; python_version < '3.7'", - "pylangacq==0.13.0", + "pylangacq==0.13.1", "wordseg==0.0.2", ], package_data={