diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml deleted file mode 100644 index ea04a0e..0000000 --- a/.github/FUNDING.yml +++ /dev/null @@ -1,14 +0,0 @@ -# These are supported funding model platforms - -github: [tos-kamiya] # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] -# patreon: # Replace with a single Patreon username -# open_collective: # Replace with a single Open Collective username -# ko_fi: # Replace with a single Ko-fi username -# tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel -# community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry -# liberapay: # Replace with a single Liberapay username -# issuehunt: # Replace with a single IssueHunt username -# otechie: # Replace with a single Otechie username -# lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry -# custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] -custom: ['https://www.amazon.jp/hz/wishlist/ls/3EMPATGCODYA3?ref_=wl_share'] diff --git a/.github/workflows/tests-windows.yaml b/.github/workflows/tests-windows.yaml deleted file mode 100644 index 44ea89e..0000000 --- a/.github/workflows/tests-windows.yaml +++ /dev/null @@ -1,28 +0,0 @@ -name: Tests - -on: [push] - -jobs: - build: - runs-on: ${{ matrix.platform }} - strategy: - max-parallel: 15 - matrix: - platform: [windows-latest] - python-version: ['3.8', '3.9', '3.10'] - - steps: - - uses: actions/checkout@v2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2.2.2 - with: - python-version: ${{ matrix.python-version }} - - name: Install pip dependencies - run: | - python -m pip install --upgrade pip setuptools wheel - python -m pip install tox tox-gh-actions - python -m pip install docopt-ng - - name: Install the package under test - run: python -m pip install -e . - - name: Test with tox - run: tox diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 36832a7..24a7fba 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -9,7 +9,7 @@ jobs: max-parallel: 15 matrix: platform: [ubuntu-latest, macos-latest] - python-version: ['3.8', '3.9', '3.10'] + python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] steps: - uses: actions/checkout@v2 @@ -21,7 +21,6 @@ jobs: run: | python -m pip install --upgrade pip setuptools wheel python -m pip install tox tox-gh-actions - python -m pip install docopt-ng - name: Install the package under test run: python -m pip install -e . - name: Test with tox diff --git a/README-pypi.md b/README-pypi.md index 22ef7d7..90b1d57 100644 --- a/README-pypi.md +++ b/README-pypi.md @@ -22,24 +22,11 @@ Features: ## Installation ```sh -pip install dendro-text +pipx install dendro-text ``` -If you run the dendro_text and get the following error message, please install dendro-text with docopt-ng. - -```sh -$ dendro_text -Error: the Docopt module has not been installed. Install it with `pip install docopt-ng`. -``` - -```sh -pip install dendro-text[docopt-ng] -``` - -(To make `dendro-text` compatible with both `docopt` and `docopt-ng`, dependencies on them are now explicitly extra dependencies.) - To uninstall, ```sh -pip uninstall dendro-text +pipx uninstall dendro-text ``` diff --git a/README.md b/README.md index 9e44bab..c6ab349 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ -[![Tests](https://github.com/tos-kamiya/dendro_text/actions/workflows/tests.yaml/badge.svg)](https://github.com/tos-kamiya/dendro_text/actions/workflows/tests.yaml) +[![Tests](https://github.com/tos-kamiya/dendro-text/actions/workflows/tests.yaml/badge.svg)](https://github.com/tos-kamiya/dendro-text/actions/workflows/tests.yaml) -dendro_text +dendro-text =========== Draw a dendrogram of similarity between text files. @@ -22,46 +22,26 @@ Features: ## Installation ```sh -pip install dendro-text +pipx install dendro-text ``` -If you run the dendro_text and get the following error message, please install dendro-text with docopt-ng. - -```sh -$ dendro_text -Error: the Docopt module has not been installed. Install it with `pip install docopt-ng`. -``` - -```sh -pip install dendro-text[docopt-ng] -``` - -(To make `dendro-text` compatible with both `docopt` and `docopt-ng`, dependencies on them are now explicitly extra dependencies.) - To uninstall, ```sh -pip uninstall dendro-text +pipx uninstall dendro-text ``` ### Numba (option) **To enable jit compilation by Numba, install it according to the instructions on [Numba website](https://numba.pydata.org/).** -Note that the installation of Numba differs for each platform. For example, on Ubuntu 20.04, in addition to installing `numba` with pip: - -```sh -pip install numba -``` - -The following is required: +To install dendro-text with Numba, ```sh sudo apt install python3-testresources +pipx install dendro-text --preinstall numba ``` -Numba is used transparently. When you run `dendro_text`, if it detects that Numba is installed on your system, `dendro_text` will call functions compiled in jit, otherwise it will call pure Python functions. - The speedup with Numba was approx. 5x in one example I tried. ### picaf (option) @@ -71,7 +51,7 @@ If you are doing tasks like investigating files in the dendrogram one by one (as ## Usage ```sh -dendro_text ... +dendro-text ... ``` ### Options @@ -160,7 +140,7 @@ abccfg 2. Create dendrograms showing file similarity by character-by-character comparison. ```sh -$ dendro_text -c *.txt +$ dendro-text -c *.txt ─┬─┬─┬── abcfg.txt │ │ └── abcdfg.txt │ └─┬── abccfg.txt @@ -171,7 +151,7 @@ $ dendro_text -c *.txt 3. List files in order of similarity to a file `abccfg.txt`, with option `-N0`. ```sh -$ dendro_text -c -N0 abccfg.txt *.txt +$ dendro-text -c -N0 abccfg.txt *.txt 0 abccfg.txt 1 abcccfg.txt 1 abcdfg.txt @@ -190,7 +170,7 @@ Tokens that are only in the first file are indicated by a red background color, Note that the three files `abcccfg.txt`, `abccfg.txt`, and `abcfg.txt` are now grouped in one node, because they no longer differ. ```sh -$ dendro_text -c *.txt --prep 'sed s/c//g' +$ dendro-text -c *.txt --prep 'sed s/c//g' ─┬─┬── abcdfg.txt │ └── abcccfg.txt,abccfg.txt,abcfg.txt └── abdefg.txt @@ -202,7 +182,7 @@ $ dendro_text -c *.txt --prep 'sed s/c//g' The default tokenization (extracting words from the text) method is to split text at the point where the type of letter changes. -For example, the text "The version of dendro_text is marked as v1.1.1." turns into the following token sequence: +For example, the text "The version of dendro-text is marked as v1.1.1." turns into the following token sequence: ```sh ["The", " ", "version", " ", "of", " ", "dendro", "_", "text", " ", @@ -229,7 +209,7 @@ The base name of the temporary file is the same as the original input file, but For example, in the following command line, ```sh -$ dendro_text --prep p1.sh --prep p2.sh t1.txt t2.txt t3.txt +$ dendro-text --prep p1.sh --prep p2.sh t1.txt t2.txt t3.txt ``` Preprocessing scripts `p1.sh` and `p2.sh` will get (such as) `some/temp/dir/t1.txt`, `some/temp/dir/t2.txt` or `some/temp/dir/t3.txt` as input file. @@ -242,3 +222,7 @@ Preprocessing scripts `p1.sh` and `p2.sh` will get (such as) `some/temp/dir/t1.t * The file `Blocks.txt` is released under the [Unicode Data Files and Software License](https://www.unicode.org/license.txt). * All of the other source code is released under [the BSD 2-Clause License](LICENSE). + +## Changelog + +* v2.0.0: The script is renamed to `dendro-text`. Drop windows support. diff --git a/dendro_text/VERSION b/dendro_text/VERSION index f8a696c..227cea2 100644 --- a/dendro_text/VERSION +++ b/dendro_text/VERSION @@ -1 +1 @@ -1.7.2 +2.0.0 diff --git a/dendro_text/main.py b/dendro_text/main.py index 213e8a1..713cef0 100644 --- a/dendro_text/main.py +++ b/dendro_text/main.py @@ -1,5 +1,6 @@ from typing import Callable, List, Optional, Tuple, Union +import argparse import os.path import sys import tempfile @@ -7,13 +8,6 @@ import numpy as np from tqdm import tqdm -from init_attrs_with_kwargs import cast_set_attrs -from win_wildcard import get_windows_shell, SHELL_TO_EXPAND_WILDCARD_FUNC - -try: - from docopt import docopt -except ImportError as _e: - sys.exit("Error: The Docopt module has not installed. Install it with `pip install docopt-ng`.") from .dld import distance_int_list from .print_tree import print_tree, BOX_DRAWING_TREE_PICTURE_TABLE, BOX_DRAWING_TREE_PICTURE_TABLE_W_FULLWIDTH_SPACE @@ -182,65 +176,110 @@ def print_dendrogram(result, labels, format_leaf_node, max_depth=None, tree_pict ) -class Args: - file: List[str] - tokenize: bool - char_by_char: bool - line_by_line: bool - no_uniq_files: bool - show_words: bool - diff: bool - prep: List[str] - max_depth: Optional[int] - ascii_char_tree: bool - box_drawing_tree_with_fullwidth_space: bool - file_separator: Optional[str] - field_separator: Optional[str] - workers: Optional[int] - progress: bool - neighbors: Optional[int] - neighbor_list: Optional[int] - pyplot: bool - pyplot_font_names: bool - pyplot_font: Optional[str] - version: str - - -__doc__ = """Draw dendrogram of similarity among text files. - -Usage: - dendro_text [options] [-c|-l|-t] [-n NUM|-N NUM] [-a|-B] [--prep=PREPROCESSOR]... ... - dendro_text (-n NUM|-N NUM|-d) [-c|-l|-t] [--prep=PREPROCESSOR]... ... - dendro_text --show-words [-c|-l|-t] [--prep=PREPROCESSOR]... - dendro_text --pyplot-font-names - dendro_text --version - -Options: - -t --tokenize Compare texts as tokens of languages indicated by file extensions, using Pygments lexer. - -c --char-by-char Compare texts in a char-by-char manner. - -l --line-by-line Compare texts in a line-by-line manner. - -U --no-uniq-files Do not remove duplicates from the input files. - -d --diff Diff mode (Implies option -U). **Experimental.** - -W --show-words Show words extracted from the input file. - --prep=PREPROCESSOR Perform preprocessing for each input file. - -m --max-depth=DEPTH Flatten the subtrees (of dendrogram) deeper than this. - -a --ascii-char-tree Draw tree picture with ascii characters, not box-drawing characters. - -B --box-drawing-tree-with-fullwidth-space Draw tree picture with box-drawing characters and fullwidth space. - -s --file-separator=S File separator (default: comma). - -f --field-separator=S Separator of tree picture and file (default: tab). - -j NUM --workers=NUM Parallel execution. Number of worker processes. - --progress Show progress bar with ETA. - -n --neighbors=NUM Pick up NUM (>=1) neighbors of (files similar to) the first file. Drop the other files. - -N --neighbor-list=NUM List NUM neighbors of the first file, in order of increasing distance. `0` for +inf. - -p --pyplot Plot dendrogram with `matplotlib.pyplot` - --pyplot-font-names List font names can be used in plotting dendrogram. - --pyplot-font=FONTNAME Specify font name in plotting dendrogram. -""" +def gen_parser(): + parser = argparse.ArgumentParser( + description="Draw dendrogram of similarity among text files." + ) + + # Positional argument for ... + parser.add_argument( + 'files', nargs='*', help='Input text files to compare.' + ) + + # Mutually exclusive options for -c, -l, -t + group = parser.add_mutually_exclusive_group() + group.add_argument( + '-t', '--tokenize', action='store_true', + help='Compare texts as tokens of languages indicated by file extensions, using Pygments lexer.' + ) + group.add_argument( + '-c', '--char-by-char', action='store_true', + help='Compare texts in a char-by-char manner.' + ) + group.add_argument( + '-l', '--line-by-line', action='store_true', + help='Compare texts in a line-by-line manner.' + ) + + # Other options + parser.add_argument( + '-U', '--no-uniq-files', action='store_true', + help='Do not remove duplicates from the input files.' + ) + parser.add_argument( + '-d', '--diff', action='store_true', + help='Diff mode (Implies option -U). **Experimental.**' + ) + parser.add_argument( + '-W', '--show-words', action='store_true', + help='Show words extracted from the input file.' + ) + parser.add_argument( + '--prep', action='append', metavar='PREPROCESSOR', + help='Perform preprocessing for each input file.' + ) + parser.add_argument( + '-m', '--max-depth', type=int, metavar='DEPTH', + help='Flatten the subtrees (of dendrogram) deeper than this.' + ) + parser.add_argument( + '-a', '--ascii-char-tree', action='store_true', + help='Draw tree picture with ascii characters, not box-drawing characters.' + ) + parser.add_argument( + '-B', '--box-drawing-tree-with-fullwidth-space', action='store_true', + help='Draw tree picture with box-drawing characters and fullwidth space.' + ) + parser.add_argument( + '-s', '--file-separator', metavar='S', default=',', + help='File separator (default: comma).' + ) + parser.add_argument( + '-f', '--field-separator', metavar='S', default='\t', + help='Separator of tree picture and file (default: tab).' + ) + parser.add_argument( + '-j', '--workers', type=int, metavar='NUM', + help='Parallel execution. Number of worker processes.' + ) + parser.add_argument( + '--progress', action='store_true', + help='Show progress bar with ETA.' + ) + parser.add_argument( + '-n', '--neighbors', type=int, metavar='NUM', + help='Pick up NUM (>=1) neighbors of (files similar to) the first file. Drop the other files.' + ) + parser.add_argument( + '-N', '--neighbor-list', type=int, metavar='NUM', + help='List NUM neighbors of the first file, in order of increasing distance. `0` for +inf.' + ) + parser.add_argument( + '-p', '--pyplot', action='store_true', + help='Plot dendrogram with `matplotlib.pyplot`.' + ) + parser.add_argument( + '--pyplot-font-names', action='store_true', + help='List font names that can be used in plotting dendrogram.' + ) + parser.add_argument( + '--pyplot-font', metavar='FONTNAME', + help='Specify font name in plotting dendrogram.' + ) + parser.add_argument( + '--version', action='version', version="dendro-text %s" % __version__, + help='Show program\'s version number and exit.' + ) + + return parser def main(): - docopt_args = docopt(__doc__, version="dendro_text %s" % __version__) - args: Args = cast_set_attrs(Args(), **docopt_args) + parser = gen_parser() + args = parser.parse_args() + if not args.files: + parser.print_help() + return option_neighbor_list = args.neighbor_list if args.neighbor_list is not None else -1 if args.pyplot: @@ -272,14 +311,7 @@ def main(): else None ) - files = args.file - ws = get_windows_shell() - if ws is not None: - expand_func = SHELL_TO_EXPAND_WILDCARD_FUNC[ws] - r = [] - for f in files: - r.extend(expand_func(f)) - files = r + files = args.files if not (args.diff or args.no_uniq_files): files = uniq(files) diff --git a/pyproject.toml b/pyproject.toml index ddfff13..5cd1506 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,10 +11,11 @@ classifiers = [ "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "License :: OSI Approved :: BSD License", "Operating System :: MacOS :: MacOS X", "Operating System :: POSIX :: Linux", - "Operating System :: Microsoft :: Windows", ] requires-python = ">=3.8" dependencies = [ @@ -23,8 +24,6 @@ dependencies = [ "pygments", # matplotlib # for option -p "tqdm", - "init-attrs-with-kwargs>=0.2.0", - "win-wildcard>=0.5.0", ] dynamic = ["version"] @@ -35,12 +34,8 @@ content-type = "text/markdown" [project.urls] Homepage = "https://github.com/tos-kamiya/dendro_text" -[project.optional-dependencies] -docopt-ng = ["docopt-ng"] -docopt = ["docopt"] - [project.scripts] -dendro_text = "dendro_text.main:main" +dendro-text = "dendro_text.main:main" [tool.setuptools] include-package-data = false @@ -58,15 +53,16 @@ find = {namespaces = false} version = {file = ["dendro_text/VERSION"]} [tool.tox.tox] -envlist = "py38, py39, py310" +envlist = "py38, py39, py310, py311, py312" [tool.gh-actions] python = """ 3.8: py38 3.9: py39 -3.10: py310""" +3.10: py310 +3.11: py311 +3.12: py312""" [tool.testenv] commands = """ -python -m pip install docopt-ng python -m unittest discover""" diff --git a/tests/test_N0.sh b/tests/test_N0.sh index e29009d..4ac1e58 100644 --- a/tests/test_N0.sh +++ b/tests/test_N0.sh @@ -7,7 +7,7 @@ for t in ab{c,cc,ccc,cd,de}fg.txt; do echo $t > $tmp_dir/$t done -dendro_text -c -f ' ' -N0 $tmp_dir/abccfg.txt $tmp_dir/*.txt | sed s+$tmp_dir/++g > $tmp_dir/result +dendro-text -c -f ' ' -N0 $tmp_dir/abccfg.txt $tmp_dir/*.txt | sed s+$tmp_dir/++g > $tmp_dir/result cat <<'EOS' | diff $tmp_dir/result - 0 abccfg.txt diff --git a/tests/test_N3.sh b/tests/test_N3.sh index 9b0ed9d..4c701a2 100644 --- a/tests/test_N3.sh +++ b/tests/test_N3.sh @@ -7,7 +7,7 @@ for t in ab{c,cc,ccc,cd,de}fg.txt; do echo $t > $tmp_dir/$t done -dendro_text -c -f ' ' -N3 $tmp_dir/abcccfg.txt $tmp_dir/*.txt | sed s+$tmp_dir/++g > $tmp_dir/result +dendro-text -c -f ' ' -N3 $tmp_dir/abcccfg.txt $tmp_dir/*.txt | sed s+$tmp_dir/++g > $tmp_dir/result cat <<'EOS' | diff $tmp_dir/result - 0 abcccfg.txt diff --git a/tests/test_a.sh b/tests/test_a.sh index 4dacb9a..d438311 100644 --- a/tests/test_a.sh +++ b/tests/test_a.sh @@ -7,7 +7,7 @@ for t in ab{c,cc,ccc,cd,de}fg.txt; do echo $t > $tmp_dir/$t done -dendro_text -c -f ' ' -a $tmp_dir/*.txt | sed s+$tmp_dir/++g > $tmp_dir/result +dendro-text -c -f ' ' -a $tmp_dir/*.txt | sed s+$tmp_dir/++g > $tmp_dir/result cat <<'EOS' | diff $tmp_dir/result - -+-+-+-- abcfg.txt diff --git a/tests/test_identical_files.sh b/tests/test_identical_files.sh index f816328..3cfe7e0 100644 --- a/tests/test_identical_files.sh +++ b/tests/test_identical_files.sh @@ -9,7 +9,7 @@ done echo iji > $tmp_dir/ij.txt -dendro_text -f ' ' -a $tmp_dir/*.txt | sed s+$tmp_dir/++g > $tmp_dir/result +dendro-text -f ' ' -a $tmp_dir/*.txt | sed s+$tmp_dir/++g > $tmp_dir/result cat <<'EOS' | diff $tmp_dir/result - -+-- ij.txt