Increase max output tokens for gpt-4o and sonnet-3.5
yuntongzhang committed Aug 9, 2024
1 parent c09dc45 commit b1125ae
Showing 8 changed files with 86 additions and 28 deletions.
2 changes: 2 additions & 0 deletions Dockerfile.minimal
@@ -9,5 +9,7 @@ RUN apt update && apt install -y vim build-essential libssl-dev

COPY . /opt/auto-code-rover

ENV PYTHONPATH=/opt/auto-code-rover

WORKDIR /opt/auto-code-rover
RUN conda env create -f environment.yml
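The new `ENV PYTHONPATH=/opt/auto-code-rover` line bakes the repository root into the import path of every process in the image. A minimal sketch of the effect (illustrative only, not part of the commit):

```
import sys

# With ENV PYTHONPATH=/opt/auto-code-rover set in the image, the repo root
# is on sys.path for every process, so `import app.main` resolves without
# the PYTHONPATH=. prefix used in the README commands.
assert "/opt/auto-code-rover" in sys.path
import app.main  # works inside the container
```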
14 changes: 8 additions & 6 deletions README.md
@@ -155,12 +155,12 @@ and generate patch:
```
cd /opt/auto-code-rover
conda activate auto-code-rover
PYTHONPATH=. python app/main.py github-issue --output-dir output --setup-dir setup --model gpt-4-0125-preview --model-temperature 0.2 --task-id <task id> --clone-link <link for cloning the project> --commit-hash <any version that has the issue> --issue-link <link to issue page>
PYTHONPATH=. python app/main.py github-issue --output-dir output --setup-dir setup --model gpt-4o-2024-05-13 --model-temperature 0.2 --task-id <task id> --clone-link <link for cloning the project> --commit-hash <any version that has the issue> --issue-link <link to issue page>
```
Here is an example command for running ACR on an issue from the langchain GitHub issue tracker:

```
PYTHONPATH=. python app/main.py github-issue --output-dir output --setup-dir setup --model gpt-4-0125-preview --model-temperature 0.2 --task-id langchain-20453 --clone-link https://github.com/langchain-ai/langchain.git --commit-hash cb6e5e5 --issue-link https://github.com/langchain-ai/langchain/issues/20453
PYTHONPATH=. python app/main.py github-issue --output-dir output --setup-dir setup --model gpt-4o-2024-05-13 --model-temperature 0.2 --task-id langchain-20453 --clone-link https://github.com/langchain-ai/langchain.git --commit-hash cb6e5e5 --issue-link https://github.com/langchain-ai/langchain/issues/20453
```

The `<task id>` can be any string used to identify this issue.
@@ -189,7 +189,7 @@ and run the following commands:
```
cd /opt/auto-code-rover
conda activate auto-code-rover
PYTHONPATH=. python app/main.py local-issue --output-dir output --model gpt-4-0125-preview --model-temperature 0.2 --task-id <task id> --local-repo <path to the local project repository> --issue-file <path to the file containing issue description>
PYTHONPATH=. python app/main.py local-issue --output-dir output --model gpt-4o-2024-05-13 --model-temperature 0.2 --task-id <task id> --local-repo <path to the local project repository> --issue-file <path to the file containing issue description>
```

If patch generation is successful, the path to the generated patch will be printed at the end.
@@ -243,7 +243,7 @@ Before running the task (`django__django-11133` here), make sure it has been set
```
cd /opt/auto-code-rover
conda activate auto-code-rover
PYTHONPATH=. python app/main.py swe-bench --model gpt-4-0125-preview --setup-map ../SWE-bench/setup_result/setup_map.json --tasks-map ../SWE-bench/setup_result/tasks_map.json --output-dir output --task django__django-11133
PYTHONPATH=. python app/main.py swe-bench --model gpt-4o-2024-05-13 --setup-map ../SWE-bench/setup_result/setup_map.json --tasks-map ../SWE-bench/setup_result/tasks_map.json --output-dir output --task django__django-11133
```

The output of the run can then be found in `output/`. For example, the patch generated for `django__django-11133` can be found at a location like this: `output/applicable_patch/django__django-11133_yyyy-MM-dd_HH-mm-ss/extracted_patch_1.diff` (the date-time field in the directory name will be different depending on when the experiment was run).
@@ -255,7 +255,7 @@ First, put the IDs of all tasks to run in a file, one per line. Suppose this fi
```
cd /opt/auto-code-rover
conda activate auto-code-rover
PYTHONPATH=. python app/main.py swe-bench --model gpt-4-0125-preview --setup-map ../SWE-bench/setup_result/setup_map.json --tasks-map ../SWE-bench/setup_result/tasks_map.json --output-dir output --task-list-file /opt/SWE-bench/tasks.txt
PYTHONPATH=. python app/main.py swe-bench --model gpt-4o-2024-05-13 --setup-map ../SWE-bench/setup_result/setup_map.json --tasks-map ../SWE-bench/setup_result/tasks_map.json --output-dir output --task-list-file /opt/SWE-bench/tasks.txt
```

**NOTE**: make sure that the tasks in `tasks.txt` have all been set up in SWE-bench. See the steps [above](#set-up-one-or-more-tasks-in-swe-bench).
@@ -278,7 +278,9 @@ The current list of supported models:

| | Model | AutoCodeRover cmd line argument |
|:--------------:|---------------|--------------|
| OpenAI | gpt-4-turbo-2024-04-09 | --model gpt-4-turbo-2024-04-09 |
| OpenAI | gpt-4o-2024-08-06 | --model gpt-4o-2024-08-06 |
| | gpt-4o-2024-05-13 | --model gpt-4o-2024-05-13 |
| | gpt-4-turbo-2024-04-09 | --model gpt-4-turbo-2024-04-09 |
| | gpt-4-0125-preview | --model gpt-4-0125-preview |
| | gpt-4-1106-preview | --model gpt-4-1106-preview |
| | gpt-3.5-turbo-0125 | --model gpt-3.5-turbo-0125 |
5 changes: 4 additions & 1 deletion app/log.py
@@ -10,7 +10,10 @@


def terminal_width():
return get_terminal_size().columns
try:
return get_terminal_size().columns
except OSError:
return 80


WIDTH = min(120, terminal_width() - 10)
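The fallback matters because `WIDTH` is computed at module import time: `os.get_terminal_size()` raises `OSError` whenever stdout is not attached to a terminal (piped output, CI jobs, or a `docker run` without `-t`), so the old one-liner could crash on import. A small reproduction sketch (illustrative):

```
import os

try:
    cols = os.get_terminal_size().columns
except OSError:
    # e.g. `python script.py | cat`, or a container without a tty
    cols = 80

print(min(120, cols - 10))  # mirrors the module-level WIDTH computation
```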
14 changes: 12 additions & 2 deletions app/main.py
@@ -3,6 +3,7 @@
"""

import json
import shutil
from argparse import ArgumentParser
from collections.abc import Callable, Mapping, Sequence
from concurrent.futures import ProcessPoolExecutor
@@ -154,7 +155,11 @@ def set_github_parser_args(parser: ArgumentParser) -> None:
parser.add_argument(
"--clone-link", type=str, help="The link to the repository to clone."
)
parser.add_argument("--commit-hash", type=str, help="The commit hash to checkout.")
parser.add_argument(
"--commit-hash",
type=str,
help="The commit hash to checkout. If not specified, the latest commit on default branch will be used.",
)
parser.add_argument("--issue-link", type=str, help="The link to the issue.")
parser.add_argument(
"--setup-dir",
@@ -468,11 +473,16 @@ def run_raw_task(
f"Patch generation is disabled. Please find fix locations at: {task_output_dir}/fix_locations.json"
)
else:
output_patch_path = pjoin(task_output_dir, "final_patch.diff")
final_patch_path = get_final_patch_path(task_output_dir)
if final_patch_path is not None:
# copy the final patch to the fixed path
shutil.copy2(final_patch_path, output_patch_path)

log.log_and_always_print(
f"Please find the generated patch at: {final_patch_path}"
f"Please find the generated patch at: {output_patch_path}"
)

if isinstance(task, RawSweTask):
log.log_and_always_print(
"[SWE-bench mode] Note that the patch may be move to other paths in SWE-bench mode. "
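Copying to `final_patch.diff` gives every successful run a stable artifact path. `get_final_patch_path` itself is not shown in this diff; the helper body below is an assumption about its behaviour, sketched only to make the new flow concrete:

```
import shutil
from glob import glob
from os.path import join as pjoin


def get_final_patch_path(task_output_dir: str) -> str | None:
    # Assumed behaviour: return the newest extracted_patch_*.diff,
    # or None when no patch was extracted for this task.
    candidates = sorted(glob(pjoin(task_output_dir, "extracted_patch_*.diff")))
    return candidates[-1] if candidates else None


task_output_dir = "output/some_task_dir"  # hypothetical path
output_patch_path = pjoin(task_output_dir, "final_patch.diff")
final_patch_path = get_final_patch_path(task_output_dir)
if final_patch_path is not None:
    # whichever attempt produced the patch, consumers can now always
    # find a copy at the fixed location final_patch.diff
    shutil.copy2(final_patch_path, output_patch_path)
```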
18 changes: 14 additions & 4 deletions app/model/claude.py
@@ -34,11 +34,13 @@ def __init__(
name: str,
cost_per_input: float,
cost_per_output: float,
max_output_token: int = 4096,
parallel_tool_call: bool = False,
):
if self._initialized:
return
super().__init__(name, cost_per_input, cost_per_output, parallel_tool_call)
self.max_output_token = max_output_token
self._initialized = True

def setup(self) -> None:
@@ -72,9 +74,13 @@ def call(
top_p=1,
tools=None,
response_format: Literal["text", "json_object"] = "text",
temperature: float | None = None,
**kwargs,
):
# FIXME: the tools field is ignored since we don't use tools for now
if temperature is None:
temperature = common.MODEL_TEMP

try:
# Anthropic models - prefilling the response with { increases the success
# rate of producing JSON output
@@ -85,8 +91,8 @@
response = litellm.completion(
model=self.name,
messages=messages,
temperature=common.MODEL_TEMP,
max_tokens=1024,
temperature=temperature,
max_tokens=self.max_output_token,
top_p=top_p,
stream=False,
)
@@ -122,7 +128,7 @@ def __init__(self):
super().__init__(
"claude-3-opus-20240229", 0.000015, 0.000075, parallel_tool_call=True
)
self.note = "Most powerful model from Antropic"
self.note = "Most powerful model among Claude 3"


class Claude3Sonnet(AnthropicModel):
@@ -144,6 +150,10 @@ def __init__(self):
class Claude3_5Sonnet(AnthropicModel):
def __init__(self):
super().__init__(
"claude-3-5-sonnet-20240620", 0.000003, 0.000015, parallel_tool_call=True
"claude-3-5-sonnet-20240620",
0.000003,
0.000015,
max_output_token=8192,
parallel_tool_call=True,
)
self.note = "Most intelligent model from Antropic"
53 changes: 39 additions & 14 deletions app/model/gpt.py
@@ -7,6 +7,7 @@
import sys
from typing import Literal, cast

from loguru import logger
from openai import BadRequestError, OpenAI
from openai.types.chat import (
ChatCompletion,
@@ -45,13 +46,18 @@ def __new__(cls):
def __init__(
self,
name: str,
max_output_token: int,
cost_per_input: float,
cost_per_output: float,
parallel_tool_call: bool = False,
):
if self._initialized:
return
super().__init__(name, cost_per_input, cost_per_output, parallel_tool_call)
# max number of output tokens allowed in the model response
# sometimes we want a lower number for models with smaller context windows,
# because the output token limit consumes part of the context window
self.max_output_token = max_output_token
# client for making request
self.client: OpenAI | None = None
self._initialized = True
@@ -84,7 +90,8 @@ def extract_resp_content(
return content

def extract_resp_func_calls(
self, chat_completion_message: ChatCompletionMessage
self,
chat_completion_message: ChatCompletionMessage,
) -> list[FunctionCallIntent]:
"""
Given a chat completion message, extract the function calls from it.
@@ -124,6 +131,7 @@ def call(
top_p: float = 1,
tools: list[dict] | None = None,
response_format: Literal["text", "json_object"] = "text",
temperature: float | None = None,
**kwargs,
) -> tuple[
str,
@@ -147,6 +155,9 @@ def call(
Raw response and parsed components.
The raw response is to be sent back as part of the message history.
"""
if temperature is None:
temperature = common.MODEL_TEMP

assert self.client is not None
try:
if tools is not None and len(tools) == 1:
@@ -158,9 +169,9 @@
messages=messages, # type: ignore
tools=tools, # type: ignore
tool_choice=cast(ChatCompletionToolChoiceOptionParam, tool_choice),
temperature=common.MODEL_TEMP,
temperature=temperature,
response_format=ResponseFormat(type=response_format),
max_tokens=1024,
max_tokens=self.max_output_token,
top_p=top_p,
stream=False,
)
@@ -169,9 +180,9 @@
model=self.name,
messages=messages, # type: ignore
tools=tools, # type: ignore
temperature=common.MODEL_TEMP,
temperature=temperature,
response_format=ResponseFormat(type=response_format),
max_tokens=1024,
max_tokens=self.max_output_token,
top_p=top_p,
stream=False,
)
@@ -201,39 +212,48 @@ def call(
output_tokens,
)
except BadRequestError as e:
logger.debug("BadRequestError ({}): messages={}", e.code, messages)
if e.code == "context_length_exceeded":
log_and_print("Context length exceeded")
raise e


class Gpt4o_20240806(OpenaiModel):
def __init__(self):
super().__init__(
"gpt-4o-2024-08-06", 16384, 0.0000025, 0.000010, parallel_tool_call=True
)
self.note = "Multimodal model. Up to Apr 2023."


class Gpt4o_20240513(OpenaiModel):
def __init__(self):
super().__init__(
"gpt-4o-2024-05-13", 0.000005, 0.000015, parallel_tool_call=True
"gpt-4o-2024-05-13", 4096, 0.000005, 0.000015, parallel_tool_call=True
)
self.note = "Multimodal model. Up to Oct 2023."


class Gpt4_Turbo20240409(OpenaiModel):
def __init__(self):
super().__init__(
"gpt-4-turbo-2024-04-09", 0.00001, 0.00003, parallel_tool_call=True
"gpt-4-turbo-2024-04-09", 4096, 0.00001, 0.00003, parallel_tool_call=True
)
self.note = "Turbo with vision. Up to Dec 2023."


class Gpt4_0125Preview(OpenaiModel):
def __init__(self):
super().__init__(
"gpt-4-0125-preview", 0.00001, 0.00003, parallel_tool_call=True
"gpt-4-0125-preview", 4096, 0.00001, 0.00003, parallel_tool_call=True
)
self.note = "Turbo. Up to Dec 2023."


class Gpt4_1106Preview(OpenaiModel):
def __init__(self):
super().__init__(
"gpt-4-1106-preview", 0.00001, 0.00003, parallel_tool_call=True
"gpt-4-1106-preview", 4096, 0.00001, 0.00003, parallel_tool_call=True
)
self.note = "Turbo. Up to Apr 2023."

@@ -242,32 +262,37 @@ class Gpt35_Turbo0125(OpenaiModel):
# cheapest gpt model
def __init__(self):
super().__init__(
"gpt-3.5-turbo-0125", 0.0000005, 0.0000015, parallel_tool_call=True
"gpt-3.5-turbo-0125", 1024, 0.0000005, 0.0000015, parallel_tool_call=True
)
self.note = "Turbo. Up to Sep 2021."


class Gpt35_Turbo1106(OpenaiModel):
def __init__(self):
super().__init__(
"gpt-3.5-turbo-1106", 0.000001, 0.000002, parallel_tool_call=True
"gpt-3.5-turbo-1106", 1024, 0.000001, 0.000002, parallel_tool_call=True
)
self.note = "Turbo. Up to Sep 2021."


class Gpt35_Turbo16k_0613(OpenaiModel):
def __init__(self):
super().__init__("gpt-3.5-turbo-16k-0613", 0.000003, 0.000004)
super().__init__("gpt-3.5-turbo-16k-0613", 1024, 0.000003, 0.000004)
self.note = "Turbo. Deprecated. Up to Sep 2021."


class Gpt35_Turbo0613(OpenaiModel):
def __init__(self):
super().__init__("gpt-3.5-turbo-0613", 0.0000015, 0.000002)
super().__init__("gpt-3.5-turbo-0613", 512, 0.0000015, 0.000002)
self.note = "Turbo. Deprecated. Only 4k window. Up to Sep 2021."


class Gpt4_0613(OpenaiModel):
def __init__(self):
super().__init__("gpt-4-0613", 0.00003, 0.00006)
super().__init__("gpt-4-0613", 512, 0.00003, 0.00006)
self.note = "Not turbo. Up to Sep 2021."


class Gpt4o_mini_20240718(OpenaiModel):
def __init__(self):
super().__init__("gpt-4o-mini-2024-07-18", 4096, 0.00000015, 0.0000006)
1 change: 1 addition & 0 deletions app/model/register.py
@@ -6,6 +6,7 @@ def register_all_models() -> None:
Register all models. This is called in main.
"""
common.register_model(gpt.Gpt4o_20240513())
common.register_model(gpt.Gpt4o_mini_20240718())
common.register_model(gpt.Gpt4_Turbo20240409())
common.register_model(gpt.Gpt4_0125Preview())
common.register_model(gpt.Gpt4_1106Preview())
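`common.register_model` is not shown in this commit; a plausible shape for the registry it maintains (the names below are assumptions, for illustration only):

```
# Hypothetical sketch: a name -> instance map lets the CLI's --model
# argument resolve to a registered model object.
_MODELS: dict[str, object] = {}


def register_model(model) -> None:
    _MODELS[model.name] = model


def get_model(name: str):
    return _MODELS[name]  # KeyError doubles as "unsupported model"
```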
7 changes: 6 additions & 1 deletion app/raw_tasks.py
@@ -92,12 +92,13 @@ def __init__(
self,
task_id: str,
clone_link: str,
commit_hash: str,
commit_hash: str | None,
issue_link: str,
setup_dir: str,
):
self._task_id = task_id
self.clone_link = clone_link
# if commit_hash is None, use the HEAD of the default branch
self.commit_hash = commit_hash
self.issue_link = issue_link
self.setup_dir = setup_dir
@@ -118,6 +119,10 @@ def clone_repo(self):
shutil.rmtree(clone_path)
app_utils.clone_repo(self.clone_link, str(clone_path.parent), clone_path.name)
log_and_print(f"Cloned source code to {clone_path}.")
if self.commit_hash is None:
# let's get the current commit hash
with app_utils.cd(clone_path):
self.commit_hash = app_utils.get_current_commit_hash()

def dump_meta_data(self, output_dir: str):
meta = {
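`app_utils.get_current_commit_hash` is not part of this diff; a minimal implementation consistent with how `clone_repo` calls it, offered as an assumption:

```
import subprocess


def get_current_commit_hash() -> str:
    # Assumes the working directory is inside the freshly cloned repo
    # (clone_repo enters it via app_utils.cd before calling this).
    cp = subprocess.run(
        ["git", "rev-parse", "HEAD"],
        capture_output=True,
        text=True,
        check=True,
    )
    return cp.stdout.strip()
```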
