diff --git a/graphrag/index.py b/graphrag/index.py
index 9b1fa9785b6..e44bea2f11f 100644
--- a/graphrag/index.py
+++ b/graphrag/index.py
@@ -21,6 +21,7 @@
 import networkx as nx
 from api.db import LLMType
 from api.db.services.llm_service import LLMBundle
+from api.db.services.user_service import TenantService
 from graphrag.community_reports_extractor import CommunityReportsExtractor
 from graphrag.entity_resolution import EntityResolution
 from graphrag.graph_extractor import GraphExtractor
@@ -30,6 +31,11 @@
 
 
 def be_children(obj: dict, keyset:set):
+    if isinstance(obj, str):
+        obj = [obj]
+    if isinstance(obj, list):
+        for i in obj: keyset.add(i)
+        return [{"id": i, "children":[]} for i in obj]
     arr = []
     for k,v in obj.items():
         k = re.sub(r"\*+", "", k)
@@ -65,7 +71,8 @@ def graph_merge(g1, g2):
 
 
 def build_knowlege_graph_chunks(tenant_id: str, chunks: List[str], callback, entity_types=["organization", "person", "location", "event", "time"]):
-    llm_bdl = LLMBundle(tenant_id, LLMType.CHAT)
+    _, tenant = TenantService.get_by_id(tenant_id)
+    llm_bdl = LLMBundle(tenant_id, LLMType.CHAT, tenant.llm_id)
     ext = GraphExtractor(llm_bdl)
     left_token_count = llm_bdl.max_length - ext.prompt_token_count - 1024
     left_token_count = max(llm_bdl.max_length * 0.8, left_token_count)
diff --git a/graphrag/mind_map_extractor.py b/graphrag/mind_map_extractor.py
index e4daae5ebd6..f5e80d7bf37 100644
--- a/graphrag/mind_map_extractor.py
+++ b/graphrag/mind_map_extractor.py
@@ -13,7 +13,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-
+import collections
+import logging
+import re
 import logging
 import traceback
 from concurrent.futures import ThreadPoolExecutor
@@ -65,7 +67,7 @@ def __call__(
         try:
             exe = ThreadPoolExecutor(max_workers=12)
             threads = []
-            token_count = self._llm.max_length * 0.7
+            token_count = max(self._llm.max_length * 0.8, self._llm.max_length-512)
             texts = []
             res = []
             cnt = 0
@@ -122,6 +124,19 @@ def _list_to_kv(self, data):
                 continue
         return data
 
+    def _todict(self, layer:collections.OrderedDict):
+        to_ret = layer
+        if isinstance(layer, collections.OrderedDict):
+            to_ret = dict(layer)
+
+        try:
+            for key, value in to_ret.items():
+                to_ret[key] = self._todict(value)
+        except AttributeError:
+            pass
+
+        return self._list_to_kv(to_ret)
+
     def _process_document(
         self, text: str, prompt_variables: dict[str, str]
     ) -> str:
@@ -132,6 +147,7 @@ def _process_document(
         text = perform_variable_replacements(self._mind_map_prompt, variables=variables)
         gen_conf = {"temperature": 0.5}
         response = self._llm.chat(text, [], gen_conf)
+        response = re.sub(r"```[^\n]*", "", response)
         print(response)
-        print("---------------------------------------------------\n", markdown_to_json.dictify(response))
-        return dict(markdown_to_json.dictify(response))
+        print("---------------------------------------------------\n", self._todict(markdown_to_json.dictify(response)))
+        return self._todict(markdown_to_json.dictify(response))
diff --git a/graphrag/mind_map_prompt.py b/graphrag/mind_map_prompt.py
index f6016b7ebd8..d87a7e8959c 100644
--- a/graphrag/mind_map_prompt.py
+++ b/graphrag/mind_map_prompt.py
@@ -14,28 +14,20 @@
 # limitations under the License.
 #
 
 MIND_MAP_EXTRACTION_PROMPT = """
-- Role: You're a talent text processor.
+- Role: You're a talented text processor that summarizes a piece of text into a mind map.
 
-- Step of task:
-  1. Generate a title for user's 'TEXT'。
-  2. Classify the 'TEXT' into sections as you see fit.
-  3. If the subject matter is really complex, split them into sub-sections.
+- Steps of the task:
+  1. Generate a title for the user's 'TEXT'.
+  2. Classify the 'TEXT' into sections of a mind map.
+  3. If the subject matter is really complex, split it into sub-sections and sub-subsections.
+  4. Add a short content summary for each bottom-level section.
+
+- Output requirement:
+  - Always try to maximize the number of sub-sections.
+  - In the language of 'TEXT'.
+  - MUST BE IN MARKDOWN FORMAT.
-
-- Output requirement:
-  - In language of 'TEXT'
-  - MUST IN FORMAT OF MARKDOWN
-
-Output:
-## <Title>
-- <Section Name>
-- <Section Name>
-  - <Subsection Name>
-  - <Subsection Name>
-- <Section Name>
-  - <Subsection Name>
 
 -TEXT-
 {input_text}
-Output:
 """
\ No newline at end of file
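
A note on the `mind_map_extractor.py` hunks above: chat models often wrap their markdown output in code fences, which trips up the markdown parser, and `markdown_to_json.dictify()` returns nested `OrderedDict`s that do not serialize cleanly further down the pipeline. The added `re.sub` call strips the fence markers before parsing, and the new `_todict` helper unwraps the parse tree into plain dicts. Below is a standalone sketch of that post-processing; the sample response is invented, and list handling is inlined here where the real method defers to `_list_to_kv`:

```python
import collections
import re

import markdown_to_json  # pip install markdown-to-json

# Invented sample of a fenced LLM response; built from a fence variable so
# this snippet itself contains no literal fence lines.
fence = "`" * 3
response = (
    fence + "markdown\n"
    "## Solar System\n"
    "- Planets\n"
    "  - Inner planets: rocky worlds\n"
    "  - Outer planets: gas giants\n"
    + fence
)

# Same cleanup the patch adds: remove fence lines, language tag included.
cleaned = re.sub(fence + r"[^\n]*", "", response)


def todict(layer):
    """Unwrap dictify()'s nested OrderedDicts into plain dicts."""
    if isinstance(layer, collections.OrderedDict):
        layer = dict(layer)
    if isinstance(layer, list):
        # The extractor routes lists through _list_to_kv instead; recursing
        # here keeps the sketch self-contained.
        return [todict(item) for item in layer]
    try:
        for key, value in layer.items():
            layer[key] = todict(value)
    except AttributeError:
        pass  # leaf values such as strings have no .items()
    return layer


print(todict(markdown_to_json.dictify(cleaned)))
```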
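
For reference, the arithmetic behind the two token budgets the patch raises (all numbers invented). Note that multiplying by 0.8 yields a float, so downstream length checks end up comparing against a fractional token count:

```python
# Hypothetical model figures for illustration only.
max_length = 8192      # context window reported by the LLM bundle
prompt_tokens = 2000   # tokens consumed by the extraction prompt

# graphrag/index.py: reserve room for the prompt plus a 1024-token answer,
# but never budget less than 80% of the window for input text.
left_token_count = max_length - prompt_tokens - 1024        # 5168
left_token_count = max(max_length * 0.8, left_token_count)  # 6553.6

# graphrag/mind_map_extractor.py: previously 0.7 * max_length; now each
# request keeps 512 tokens of headroom, or 80% of the window if that is larger.
token_count = max(max_length * 0.8, max_length - 512)       # 7680

print(left_token_count, token_count)
```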