Skip to content

Commit

Permalink
Lexer should consider BOM for the start offset (#11732)
Browse files Browse the repository at this point in the history
## Summary

This PR fixes a bug where the lexer didn't account for the BOM when
applying the start offset.

fixes: #11731

## Test Plan

Add multiple lexer test cases whose source contains a BOM character and
verify the snapshots.
  • Loading branch information
dhruvmanila authored Jun 4, 2024
1 parent 3b19df0 commit 2567e14
Show file tree
Hide file tree
Showing 4 changed files with 117 additions and 15 deletions.
55 changes: 40 additions & 15 deletions crates/ruff_python_parser/src/lexer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ mod cursor;
mod fstring;
mod indentation;

/// The Unicode byte order mark (U+FEFF), which the lexer skips when lexing starts at offset 0.
const BOM: char = '\u{feff}';

/// A lexer for Python source code.
#[derive(Debug)]
pub struct Lexer<'src> {
Expand Down Expand Up @@ -100,11 +102,10 @@ impl<'src> Lexer<'src> {
errors: Vec::new(),
};

// TODO: Handle possible mismatch between BOM and explicit encoding declaration.
// spell-checker:ignore feff
lexer.cursor.eat_char('\u{feff}');

if start_offset > TextSize::new(0) {
if start_offset == TextSize::new(0) {
// TODO: Handle possible mismatch between BOM and explicit encoding declaration.
lexer.cursor.eat_char(BOM);
} else {
lexer.cursor.skip_bytes(start_offset.to_usize());
}

Expand Down Expand Up @@ -1922,8 +1923,8 @@ mod tests {
}
}

fn lex(source: &str, mode: Mode) -> LexerOutput {
let mut lexer = Lexer::new(source, mode, TextSize::default());
fn lex(source: &str, mode: Mode, start_offset: TextSize) -> LexerOutput {
let mut lexer = Lexer::new(source, mode, start_offset);
let mut tokens = Vec::new();
loop {
let kind = lexer.next_token();
Expand All @@ -1943,8 +1944,8 @@ mod tests {
}
}

fn lex_valid(source: &str, mode: Mode) -> LexerOutput {
let output = lex(source, mode);
fn lex_valid(source: &str, mode: Mode, start_offset: TextSize) -> LexerOutput {
let output = lex(source, mode, start_offset);

if !output.errors.is_empty() {
let mut message = "Unexpected lexical errors for a valid source:\n".to_string();
Expand All @@ -1959,7 +1960,7 @@ mod tests {
}

fn lex_invalid(source: &str, mode: Mode) -> LexerOutput {
let output = lex(source, mode);
let output = lex(source, mode, TextSize::default());

assert!(
!output.errors.is_empty(),
Expand All @@ -1970,11 +1971,35 @@ mod tests {
}

fn lex_source(source: &str) -> LexerOutput {
lex_valid(source, Mode::Module)
lex_valid(source, Mode::Module, TextSize::default())
}

/// Lexes `source` in `Mode::Module`, starting at `start_offset`, by delegating to `lex_valid`
/// (the helper used for sources that are expected to lex without errors).
fn lex_source_with_offset(source: &str, start_offset: TextSize) -> LexerOutput {
lex_valid(source, Mode::Module, start_offset)
}

fn lex_jupyter_source(source: &str) -> LexerOutput {
lex_valid(source, Mode::Ipython)
lex_valid(source, Mode::Ipython, TextSize::default())
}

/// A source that begins with a BOM, lexed from the default start offset. The recorded snapshot
/// places the first token (`x`) at `3..4`, i.e. immediately after the BOM.
#[test]
fn bom() {
let source = "\u{feff}x = 1";
assert_snapshot!(lex_source(source));
}

/// A BOM-prefixed source lexed from a non-zero start offset. The offset is counted from the
/// start of the source with the BOM included, so offset 7 lands on `y` (snapshot range `7..8`).
#[test]
fn bom_with_offset() {
let source = "\u{feff}x + y + z";
assert_snapshot!(lex_source_with_offset(source, TextSize::new(7)));
}

/// Regression test: lexing a BOM-prefixed source from the offset of its last character.
#[test]
fn bom_with_offset_edge() {
// The BOM shifts the first token to offset 3, which puts the final variable `z` at
// offset 11. Make sure that lexing from that offset doesn't panic.
// See https://github.com/astral-sh/ruff/issues/11731
let source = "\u{feff}x + y + z";
assert_snapshot!(lex_source_with_offset(source, TextSize::new(11)));
}

fn ipython_escape_command_line_continuation_eol(eol: &str) -> LexerOutput {
Expand Down Expand Up @@ -2118,7 +2143,7 @@ foo = ,func
def f(arg=%timeit a = b):
pass"
.trim();
let output = lex(source, Mode::Ipython);
let output = lex(source, Mode::Ipython, TextSize::default());
assert!(output.errors.is_empty());
assert_no_ipython_escape_command(&output.tokens);
}
Expand Down Expand Up @@ -2351,7 +2376,7 @@ if first:
}

fn get_tokens_only(source: &str) -> Vec<TokenKind> {
let output = lex(source, Mode::Module);
let output = lex(source, Mode::Module, TextSize::default());
assert!(output.errors.is_empty());
output.tokens.into_iter().map(|token| token.kind).collect()
}
Expand Down Expand Up @@ -2593,7 +2618,7 @@ f"{(lambda x:{x})}"
}

fn lex_fstring_error(source: &str) -> FStringErrorType {
let output = lex(source, Mode::Module);
let output = lex(source, Mode::Module, TextSize::default());
match output
.errors
.into_iter()
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
---
source: crates/ruff_python_parser/src/lexer.rs
expression: lex_source(source)
---
## Tokens
```
[
(
Name(
"x",
),
3..4,
),
(
Equal,
5..6,
),
(
Int(
1,
),
7..8,
),
(
Newline,
8..8,
),
]
```
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
---
source: crates/ruff_python_parser/src/lexer.rs
expression: "lex_source_with_offset(source, TextSize::new(7))"
---
## Tokens
```
[
(
Name(
"y",
),
7..8,
),
(
Plus,
9..10,
),
(
Name(
"z",
),
11..12,
),
(
Newline,
12..12,
),
]
```
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
---
source: crates/ruff_python_parser/src/lexer.rs
expression: "lex_source_with_offset(source, TextSize::new(11))"
---
## Tokens
```
[
(
Name(
"z",
),
11..12,
),
(
Newline,
12..12,
),
]
```

0 comments on commit 2567e14

Please sign in to comment.