feat: Split markdown files when larger than max issue body size

Signed-off-by: Zack Koppert <[email protected]>
github · May 1, 2024 · a26168d · a26168d
1 parent d40d483
commit a26168d
Show file tree

Hide file tree

Showing 6 changed files with 146 additions and 1 deletion.
diff --git a/.gitignore b/.gitignore
@@ -1,5 +1,5 @@
 # Output files
-issue_metrics.md
+issue_metrics*.md
 issue_metrics.json
 
 # Byte-compiled / optimized / DLL files

diff --git a/README.md b/README.md
@@ -161,6 +161,7 @@ This action can be configured to authenticate with GitHub App Installation or Pe
 - [Configuring the `SEARCH_QUERY`](./docs/search-query.md)
 - [Local usage without Docker](./docs/local-usage-without-docker.md)
 - [Authenticating with GitHub App Installation](./docs/authenticating-with-github-app-installation.md)
+- [Dealing with large issue_metrics.md files](./docs/dealing-with-large-issue-metrics.md)
 
 ## Contributions
 

diff --git a/docs/dealing-with-large-issue-metrics.md b/docs/dealing-with-large-issue-metrics.md
@@ -0,0 +1,16 @@
+# Dealing with large issue metrics markdown files
+
+When working with a large number of issues, pull requests, or discussions, the resulting issue_metrics.md file can become very large. This can cause the GitHub API to return an error when trying to create an issue with the contents of the file.
+
+```shell
+Pull request creation failed. Validation failed: Body is too long (maximum is 65536 characters)
+```
+
+To work around this limitation, the issue-metrics action detects when issue_metrics.md is too large for an issue body and splits it into smaller files. Since we don't want existing workflows to fail, the first split file keeps the usual name (issue_metrics.md) and each subsequent split file has a number appended to its name (issue_metrics_1.md, issue_metrics_2.md, etc.). The full, unsplit file is saved as issue_metrics_full.md.
+
+You can choose one of the following strategies to deal with the split files:
+- Create multiple issues, each using the next split file in the sequence.
+- Upload the full file as an artifact and link to it in the issue body.
+- Create an issue and put the content of the split files as issue comments.
+
+JSON output files are not split since it's not anticipated that you will use them as issue body content.
diff --git a/issue_metrics.py b/issue_metrics.py
@@ -20,6 +20,7 @@
     main(): Run the issue-metrics script.
 """
 
+import shutil
 import sys
 from typing import List, Union
 
@@ -30,6 +31,7 @@
 from discussions import get_discussions
 from json_writer import write_to_json
 from labels import get_label_metrics, get_stats_time_in_labels
+from markdown_helpers import markdown_too_large_for_issue_body, split_markdown_file
 from markdown_writer import write_to_markdown
 from most_active_mentors import count_comments_per_user, get_mentor_count
 from time_to_answer import get_stats_time_to_answer, measure_time_to_answer
@@ -364,6 +366,7 @@ def main():
         num_mentor_count,
         search_query,
     )
+
     write_to_markdown(
         issues_with_metrics,
         stats_time_to_first_response,
@@ -377,6 +380,18 @@ def main():
         search_query,
     )
 
+    max_char_count = 65535
+    if markdown_too_large_for_issue_body("issue_metrics.md", max_char_count):
+        split_markdown_file("issue_metrics.md", max_char_count)
+        shutil.move("issue_metrics.md", "issue_metrics_full.md")
+        shutil.move("issue_metrics_0.md", "issue_metrics.md")
+        print(
+            "Issue metrics markdown file is too large for GitHub issue body and has been \
+            split into multiple files. ie. issue_metrics.md, issue_metrics_1.md, etc. \
+            The full file is saved as issue_metrics_full.md\n\
+            See https://github.com/github/issue-metrics/blob/main/docs/dealing-with-large-issue-metrics.md"
+        )
+
 
 if __name__ == "__main__":
     main()
diff --git a/markdown_helpers.py b/markdown_helpers.py
@@ -0,0 +1,38 @@
+""" Helper functions for working with markdown files. """
+
+
+def markdown_too_large_for_issue_body(file_path: str, max_char_count: int) -> bool:
+    """
+    Report whether a markdown file holds more characters than a github issue body allows.
+
+    The whole file is read as UTF-8 text and its length in characters
+    (not bytes) is compared against the limit.
+
+    Inputs:
+    file_path: str - the path to the markdown file to check
+    max_char_count: int - the maximum number of characters allowed in a github issue body
+
+    Returns:
+    bool - True if the file has more than max_char_count characters, False otherwise
+
+    """
+    with open(file_path, "r", encoding="utf-8") as markdown_file:
+        char_count = len(markdown_file.read())
+
+    # A file of exactly max_char_count characters still fits
+    return char_count > max_char_count
+
+
+def split_markdown_file(file_path: str, max_char_count: int) -> None:
+    """
+    Split the markdown file into smaller files.
+
+    The file contents are cut into consecutive chunks of at most
+    max_char_count characters. Each chunk is written next to the original
+    file as <name>_<index><extension> (for example issue_metrics_0.md,
+    issue_metrics_1.md, ...). The original file is left untouched and an
+    empty file produces no output files.
+
+    Inputs:
+    file_path: str - the path to the markdown file to split
+    max_char_count: int - the maximum number of characters allowed before splitting markdown file
+
+    Raises:
+    ValueError - if max_char_count is less than 1
+
+    """
+    # Imported locally so this module needs no additional top-level imports
+    import os
+
+    # A zero step would make range() fail and a negative one would silently
+    # produce no files, so reject both up front
+    if max_char_count < 1:
+        raise ValueError("max_char_count must be a positive integer")
+
+    with open(file_path, "r", encoding="utf-8") as file:
+        file_contents = file.read()
+
+    # Split on the real extension rather than assuming a three character ".md" suffix
+    base_path, extension = os.path.splitext(file_path)
+
+    # Chunks are cut at fixed character offsets, so a chunk may end mid-line
+    for index, start in enumerate(range(0, len(file_contents), max_char_count)):
+        chunk_path = f"{base_path}_{index}{extension}"
+        with open(chunk_path, "w", encoding="utf-8") as new_file:
+            new_file.write(file_contents[start : start + max_char_count])
diff --git a/test_markdown_helpers.py b/test_markdown_helpers.py
@@ -0,0 +1,75 @@
+""" Unit tests for the markdown_helpers module. """
+
+import os
+import unittest
+
+from markdown_helpers import markdown_too_large_for_issue_body, split_markdown_file
+
+
+class TestMarkdownHelpers(unittest.TestCase):
+    """
+    Unit tests for the markdown_helpers module.
+
+    The tests write temporary files into the current working directory and
+    register them for removal so they are cleaned up even when an assertion fails.
+    """
+
+    def _remove_on_cleanup(self, *file_paths):
+        """
+        Register files to be deleted when the current test finishes.
+
+        Files that were never created are skipped so a failure early in a test
+        does not cause a second error during cleanup.
+        """
+
+        def remove_if_exists(path):
+            if os.path.exists(path):
+                os.remove(path)
+
+        for path in file_paths:
+            self.addCleanup(remove_if_exists, path)
+
+    def test_markdown_too_large_for_issue_body(self):
+        """
+        Test the markdown_too_large_for_issue_body function with a file over the limit.
+        """
+        # Define a sample markdown file content that is twice the maximum size
+        max_char_count = 65535
+        markdown_content = "a\n" * max_char_count
+
+        # Write the markdown content to a temporary file
+        self._remove_on_cleanup("temp.md")
+        with open("temp.md", "w", encoding="utf-8") as f:
+            f.write(markdown_content)
+
+        # Call the function with the temporary file
+        result = markdown_too_large_for_issue_body("temp.md", max_char_count)
+
+        # Assert that the function returns True
+        self.assertTrue(result)
+
+    def test_markdown_not_too_large_for_issue_body(self):
+        """
+        Test the markdown_too_large_for_issue_body function with a file exactly at the limit.
+        """
+        # Define a sample markdown file content that is exactly the maximum size
+        max_char_count = 65535
+        markdown_content = "a" * max_char_count
+
+        # Write the markdown content to a temporary file
+        self._remove_on_cleanup("temp.md")
+        with open("temp.md", "w", encoding="utf-8") as f:
+            f.write(markdown_content)
+
+        # Call the function with the temporary file
+        result = markdown_too_large_for_issue_body("temp.md", max_char_count)
+
+        # Assert that the function returns False
+        self.assertFalse(result)
+
+    def test_split_markdown_file(self):
+        """
+        Test the split_markdown_file function.
+        """
+
+        # Define a sample markdown file content with 4 times the maximum character count
+        multiple_of_max = 4
+        max_char_count = 65535
+        repeated_content = "a\n"
+        markdown_content = repeated_content * (
+            (max_char_count * multiple_of_max) // len(repeated_content)
+        )
+        split_files = [f"temp_{i}.md" for i in range(multiple_of_max)]
+
+        # Write the markdown content to a temporary file
+        self._remove_on_cleanup("temp.md", *split_files)
+        with open("temp.md", "w", encoding="utf-8") as f:
+            f.write(markdown_content)
+
+        # Call the function with the temporary file
+        split_markdown_file("temp.md", max_char_count)
+
+        # Assert that the function creates one file per multiple of the maximum
+        for split_file in split_files:
+            self.assertTrue(os.path.exists(split_file))
+        self.assertFalse(os.path.exists(f"temp_{multiple_of_max}.md"))
+
+        # Assert that no file has more than max characters
+        split_contents = []
+        for split_file in split_files:
+            with open(split_file, "r", encoding="utf-8") as f:
+                split_contents.append(f.read())
+        for content in split_contents:
+            self.assertLessEqual(len(content), max_char_count)
+
+        # Assert that no content was lost or reordered by the split
+        self.assertEqual("".join(split_contents), markdown_content)
+
+
+# Run the tests in this module when the file is executed directly as a script
+if __name__ == "__main__":
+    unittest.main()