Commit 286d0ea

Move semantic encoding into parser and parse on init
Generating the encoding once should speed up semantic token
requests, since the encoding can be cached with the parser, and
it will also make the encoding easier to regression test.

Parsing on init prevents having an "unparsed" parser, which
would complicate accessing things like evaluated tokens and
semantic encoding.
aazuspan committed Aug 20, 2024
1 parent 09104fb commit 286d0ea
Showing 3 changed files with 42 additions and 38 deletions.
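
In caller terms, the change collapses two steps into one: SPINAsmParser(source).parse() becomes SPINAsmParser(source), and the derived attributes are ready as soon as the constructor returns. A minimal sketch of the new usage (the one-line program string is an arbitrary stand-in, not from this commit):

    from spinasm_lsp.parser import SPINAsmParser

    source = "sof 0,0"  # any SPINAsm program text; this line is a placeholder

    # Before this commit, construction and parsing were separate, so a
    # freshly constructed parser was "unparsed" and its token tables empty:
    #     parser = SPINAsmParser(source).parse()

    # Now __init__ calls super().parse() itself, so every instance is fully
    # parsed and its cached attributes are safe to read immediately.
    parser = SPINAsmParser(source)
    tokens = parser.evaluated_tokens      # populated during __init__
    encoding = parser.semantic_encoding   # computed once, reused per request
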
59 changes: 39 additions & 20 deletions src/spinasm_lsp/parser.py
@@ -147,9 +147,14 @@ def __init__(self, source: str):
         # Store built-in constants that were defined at initialization.
         self._constants: list[str] = list(self.symtbl.keys())
 
-        self.evaluated_tokens: TokenLookup[LSPToken] = TokenLookup()
+        super().parse()
+
+        self.evaluated_tokens: TokenLookup[LSPToken] = self._evaluate_tokens()
         """Tokens with additional metadata after evaluation."""
 
+        self.semantic_encoding: list[int] = self._encode_semantics()
+        """Integer-encoded token semantics for semantic highlighting."""
+
     def __mkopcodes__(self):
         """
         No-op.
@@ -183,25 +188,39 @@ def __next__(self):
             ):
                 self._definitions[base_token.stxt] = base_token.range
 
-    def _evaluate_token(self, token: ParsedToken) -> LSPToken:
-        """Evaluate a parsed token to determine its value and metadata."""
-        value = self.jmptbl.get(token.stxt, self.symtbl.get(token.stxt, None))
-        defined_range = self._definitions.get(token.without_address_modifier().stxt)
-
-        return LSPToken.from_parsed_token(
-            token=token,
-            value=value,
-            defined=defined_range,
-            is_constant=token.stxt in self._constants,
-            is_label=token.stxt in self.jmptbl,
-        )
-
-    def parse(self) -> SPINAsmParser:
-        """Parse and evaluate all tokens."""
-        super().parse()
-
-        for token in self._parsed_tokens:
-            evaluated_token = self._evaluate_token(token)
-            self.evaluated_tokens.add_token(evaluated_token)
-
-        return self
+    def _evaluate_tokens(self) -> TokenLookup[LSPToken]:
+        """Evaluate all parsed tokens to determine their values and metadata."""
+        evaluated_tokens: TokenLookup[LSPToken] = TokenLookup()
+
+        for token in self._parsed_tokens:
+            value = self.jmptbl.get(token.stxt, self.symtbl.get(token.stxt, None))
+            defined_range = self._definitions.get(token.without_address_modifier().stxt)
+            evaluated_token = LSPToken.from_parsed_token(
+                token=token,
+                value=value,
+                defined=defined_range,
+                is_constant=token.stxt in self._constants,
+                is_label=token.stxt in self.jmptbl,
+            )
+
+            evaluated_tokens.add_token(evaluated_token)
+
+        return evaluated_tokens
+
+    def _encode_semantics(self) -> list[int]:
+        """Encode the semantics of the parsed tokens for semantic highlighting."""
+        encoding: list[int] = []
+        prev_token_position = lsp.Position(0, 0)
+        for token in self.evaluated_tokens:
+            token_encoding = token.semantic_encoding(prev_token_position)
+
+            # Tokens without semantic encoding (e.g. operators) should be ignored so
+            # that the next encoding is relative to the last encoded token. Otherwise,
+            # character offsets would be incorrect.
+            if not token_encoding:
+                continue
+
+            encoding += token_encoding
+            prev_token_position = token.range.start
+
+        return encoding
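
For reference, the five integers that _encode_semantics accumulates per token follow the LSP semantic-token wire format: a line delta and start-character delta relative to the previous encoded token, then length, token-type index, and a modifier bitmask. A standalone sketch of that delta encoding (illustrative only; the real logic lives in LSPToken.semantic_encoding, which this diff does not show):

    from lsprotocol import types as lsp

    def encode_one(
        rng: lsp.Range, token_type: int, modifiers: int, prev_start: lsp.Position
    ) -> list[int]:
        # Offsets are relative to the previously *encoded* token, which is
        # why _encode_semantics skips unencoded tokens without updating
        # prev_token_position.
        delta_line = rng.start.line - prev_start.line
        # The character offset is relative only within the same line; on a
        # new line it is absolute.
        delta_char = (
            rng.start.character - prev_start.character
            if delta_line == 0
            else rng.start.character
        )
        length = rng.end.character - rng.start.character
        return [delta_line, delta_char, length, token_type, modifiers]
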
19 changes: 2 additions & 17 deletions src/spinasm_lsp/server.py
@@ -21,7 +21,7 @@ def _parse_document(source: str) -> SPINAsmParser:
     Parser are cached based on the source code to speed up subsequent parsing.
     """
-    return SPINAsmParser(source).parse()
+    return SPINAsmParser(source)
 
 
 class SPINAsmLanguageServer(LanguageServer):
@@ -342,22 +342,7 @@ async def semantic_tokens(
     ls: SPINAsmLanguageServer, params: lsp.SemanticTokensParams
 ) -> lsp.SemanticTokens:
     parser = await ls.get_parser(params.text_document.uri)
-
-    encoding: list[int] = []
-    prev_token_position = lsp.Position(0, 0)
-    for token in parser.evaluated_tokens:
-        token_encoding = token.semantic_encoding(prev_token_position)
-
-        # Tokens without semantic encoding (e.g. operators) should be ignored so that
-        # the next encoding is relative to the last encoded token. Otherwise, character
-        # offsets would be incorrect.
-        if not token_encoding:
-            continue
-
-        encoding += token_encoding
-        prev_token_position = token.range.start
-
-    return lsp.SemanticTokens(data=encoding)
+    return lsp.SemanticTokens(data=parser.semantic_encoding)
 
 
 def start() -> None:
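
The _parse_document docstring says parsers are cached by source, but the decorator sits outside this hunk. A plausible reading, assuming a functools.lru_cache keyed on the source string (the maxsize is a guess, not from the diff):

    from functools import lru_cache

    from spinasm_lsp.parser import SPINAsmParser

    @lru_cache(maxsize=1)  # assumed decorator; not visible in this hunk
    def _parse_document(source: str) -> SPINAsmParser:
        """Parse a document and return the parser."""
        # Because parsing (and now semantic encoding) happens in __init__,
        # a cache hit here skips both, and the semantic_tokens handler
        # reduces to an attribute read on the cached parser.
        return SPINAsmParser(source)

Either way, the handler's per-request loop is gone: repeated semanticTokens requests for unchanged source now cost a cache lookup plus an attribute read.
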
2 changes: 1 addition & 1 deletion tests/test_tokens.py
@@ -142,7 +142,7 @@ def test_get_token_positions():
     with open(patch) as fp:
         source = fp.read()
 
-    parser = SPINAsmParser(source).parse()
+    parser = SPINAsmParser(source)
 
     all_matches = parser.evaluated_tokens.get(name="apout")
     assert len(all_matches) == 4
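
The commit message's regression-testing point falls out of the same change: semantic_encoding is now a plain attribute, so it can be snapshot-tested without driving the LSP server. A hypothetical sketch (the fixture and snapshot paths are invented for illustration and are not part of this commit):

    import json

    from spinasm_lsp.parser import SPINAsmParser

    def test_semantic_encoding_regression():
        # Hypothetical paths -- not part of this commit.
        with open("tests/patches/Basic.spn") as fp:
            source = fp.read()
        with open("tests/snapshots/basic_encoding.json") as fp:
            expected = json.load(fp)

        parser = SPINAsmParser(source)

        # Every encoded token contributes exactly five integers.
        assert len(parser.semantic_encoding) % 5 == 0
        assert parser.semantic_encoding == expected
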
