Skip to content

Commit

Permalink
Merge pull request #13969 from hmac/shared-extractor-globs
Browse files Browse the repository at this point in the history
Shared extractor: support file path globs
  • Loading branch information
hmac authored Aug 23, 2023
2 parents 6cf9968 + b76842a commit 96e9dfc
Show file tree
Hide file tree
Showing 7 changed files with 226 additions and 84 deletions.
45 changes: 42 additions & 3 deletions ql/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 4 additions & 4 deletions ql/extractor/src/extractor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,25 +34,25 @@ pub fn run(options: Options) -> std::io::Result<()> {
prefix: "ql",
ts_language: tree_sitter_ql::language(),
node_types: tree_sitter_ql::NODE_TYPES,
file_extensions: vec!["ql".into(), "qll".into()],
file_globs: vec!["*.ql".into(), "*.qll".into()],
},
simple::LanguageSpec {
prefix: "dbscheme",
ts_language: tree_sitter_ql_dbscheme::language(),
node_types: tree_sitter_ql_dbscheme::NODE_TYPES,
file_extensions: vec!["dbscheme".into()],
file_globs: vec!["*.dbscheme".into()],
},
simple::LanguageSpec {
prefix: "json",
ts_language: tree_sitter_json::language(),
node_types: tree_sitter_json::NODE_TYPES,
file_extensions: vec!["json".into(), "jsonl".into(), "jsonc".into()],
file_globs: vec!["*.json".into(), "*.jsonl".into(), "*.jsonc".into()],
},
simple::LanguageSpec {
prefix: "blame",
ts_language: tree_sitter_blame::language(),
node_types: tree_sitter_blame::NODE_TYPES,
file_extensions: vec!["blame".into()],
file_globs: vec!["*.blame".into()],
},
],
trap_dir: options.output_dir,
Expand Down
4 changes: 3 additions & 1 deletion shared/tree-sitter-extractor/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
[package]
name = "codeql-extractor"
version = "0.1.0"
version = "0.2.0"
edition = "2021"
authors = ["GitHub"]

[dependencies]
flate2 = "1.0"
globset = "0.4"
tree-sitter = "0.20"
tracing = "0.1"
rayon = "1.5.0"
Expand All @@ -19,4 +20,5 @@ num_cpus = "1.14.0"

[dev-dependencies]
tree-sitter-ql = { git = "https://github.com/tree-sitter/tree-sitter-ql" }
tree-sitter-json = {git = "https://github.com/tausbn/tree-sitter-json" }
rand = "0.8.5"
62 changes: 40 additions & 22 deletions shared/tree-sitter-extractor/src/extractor/simple.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
use crate::trap;
use globset::{GlobBuilder, GlobSetBuilder};
use rayon::prelude::*;
use std::collections::HashMap;
use std::ffi::{OsStr, OsString};
use std::fs::File;
use std::io::BufRead;
use std::path::{Path, PathBuf};
Expand All @@ -13,7 +12,7 @@ pub struct LanguageSpec {
pub prefix: &'static str,
pub ts_language: tree_sitter::Language,
pub node_types: &'static str,
pub file_extensions: Vec<OsString>,
pub file_globs: Vec<String>,
}

pub struct Extractor {
Expand Down Expand Up @@ -83,16 +82,26 @@ impl Extractor {
schemas.push(schema);
}

// Construct a map from file extension -> LanguageSpec
let mut file_extension_language_mapping: HashMap<&OsStr, Vec<usize>> = HashMap::new();
for (i, lang) in self.languages.iter().enumerate() {
for (j, _ext) in lang.file_extensions.iter().enumerate() {
let indexes = file_extension_language_mapping
.entry(&lang.file_extensions[j])
.or_default();
indexes.push(i);
// Construct a single globset containing all language globs,
// and a mapping from glob index to language index.
let (globset, glob_language_mapping) = {
let mut builder = GlobSetBuilder::new();
let mut glob_lang_mapping = vec![];
for (i, lang) in self.languages.iter().enumerate() {
for glob_str in &lang.file_globs {
let glob = GlobBuilder::new(glob_str)
.literal_separator(true)
.build()
.expect("invalid glob");
builder.add(glob);
glob_lang_mapping.push(i);
}
}
}
(
builder.build().expect("failed to build globset"),
glob_lang_mapping,
)
};

let lines: std::io::Result<Vec<String>> =
std::io::BufReader::new(file_list).lines().collect();
Expand All @@ -108,18 +117,29 @@ impl Extractor {
let source = std::fs::read(&path)?;
let mut trap_writer = trap::Writer::new();

match path.extension() {
match path.file_name() {
None => {
tracing::error!(?path, "No extension found, skipping file.");
tracing::error!(?path, "No file name found, skipping file.");
}
Some(ext) => {
if let Some(indexes) = file_extension_language_mapping.get(ext) {
for i in indexes {
let lang = &self.languages[*i];
Some(filename) => {
let matches = globset.matches(filename);
if matches.is_empty() {
tracing::error!(?path, "No matching language found, skipping file.");
} else {
let mut languages_processed = vec![false; self.languages.len()];

for m in matches {
let i = glob_language_mapping[m];
if languages_processed[i] {
continue;
}
languages_processed[i] = true;
let lang = &self.languages[i];

crate::extractor::extract(
lang.ts_language,
lang.prefix,
&schemas[*i],
&schemas[i],
&mut diagnostics_writer,
&mut trap_writer,
&path,
Expand All @@ -130,11 +150,9 @@ impl Extractor {
std::fs::copy(&path, &src_archive_file)?;
write_trap(&self.trap_dir, &path, &trap_writer, trap_compression)?;
}
} else {
tracing::warn!(?path, "No language matches path, skipping file.");
}
}
};
}
Ok(()) as std::io::Result<()>
})
.expect("failed to extract files");
Expand Down
73 changes: 73 additions & 0 deletions shared/tree-sitter-extractor/tests/common/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
use std::io::{Read, Write};
use std::{
fs::File,
path::{Path, PathBuf},
};

use flate2::read::GzDecoder;

/// Locations of the temporary on-disk fixture used by the extractor tests.
///
/// `Debug` and `Clone` are derived so a fixture can be printed in assertion
/// failure messages and handed to several helpers.
#[derive(Debug, Clone)]
pub struct SourceArchive {
    /// Root of the temporary directory tree that holds everything below.
    pub root_dir: PathBuf,
    /// Text file listing the source files to extract, one path per line.
    pub file_list: PathBuf,
    /// Directory containing the source files written for the test.
    pub source_archive_dir: PathBuf,
    /// Directory intended to receive the generated TRAP output.
    pub trap_dir: PathBuf,
}

/// Builds a throw-away source tree for an extractor test.
///
/// A directory named `codeql-extractor-<random u16>` is created under the
/// system temp directory and canonicalized. Inside it, `trap` and `src`
/// sub-directories are created, each `(filename, contents)` pair in `files`
/// is written into `src`, and a `files.txt` listing the written paths (one per
/// line) is placed in the root.
///
/// # Panics
///
/// Panics if any directory or file cannot be created or written.
pub fn create_source_dir(files: Vec<(&'static str, &'static str)>) -> SourceArchive {
    let suffix = rand::random::<u16>();
    let scratch = std::env::temp_dir().join(format!("codeql-extractor-{}", suffix));
    std::fs::create_dir_all(&scratch).unwrap();
    // Canonicalize so that the paths recorded in the file list are absolute
    // and free of symlinks.
    let root_dir = scratch
        .canonicalize()
        .expect("failed to canonicalize root directory");

    let trap_dir = create_dir(&root_dir, "trap");
    let source_archive_dir = create_dir(&root_dir, "src");

    // Write every requested file into `src`, remembering where each one went.
    let written: Vec<PathBuf> = files
        .into_iter()
        .map(|(name, body)| {
            let target = source_archive_dir.join(name);
            File::create(&target)
                .unwrap()
                .write_all(body.as_bytes())
                .unwrap();
            target
        })
        .collect();

    // Emit the newline-terminated list of files for the extractor to consume.
    let file_list = root_dir.join("files.txt");
    {
        let mut listing = File::create(&file_list).unwrap();
        for entry in &written {
            writeln!(listing, "{}", entry.display()).unwrap();
        }
    }

    SourceArchive {
        root_dir,
        file_list,
        source_archive_dir,
        trap_dir,
    }
}

/// Asserts that a gzip-compressed TRAP file exists for `filename` and starts
/// with the expected auto-generated header.
///
/// The file is looked up at
/// `<root_dir>/trap/<root_dir without its leading "/">/src/<filename>.trap.gz`.
///
/// # Panics
///
/// Panics if `root_dir` does not start with `/`, if the TRAP file cannot be
/// opened or decompressed, or if its leading bytes differ from the expected
/// header.
pub fn expect_trap_file(root_dir: &Path, filename: &str) {
    const EXPECTED_HEADER: &[u8] = b"// Auto-generated TRAP file for";

    // Strip the root with `Path` operations rather than going through a
    // lossy `display()` string, so non-UTF-8 path components survive intact.
    let root_dir_relative = root_dir
        .strip_prefix("/")
        .expect("root_dir must be an absolute path starting with `/`");
    let trap_gz = root_dir
        .join("trap")
        .join(root_dir_relative)
        .join("src")
        .join(format!("{filename}.trap.gz"));

    let trap_file = File::open(&trap_gz)
        .unwrap_or_else(|e| panic!("failed to open TRAP file {}: {e}", trap_gz.display()));
    let mut decoder = GzDecoder::new(trap_file);

    // Only the header prefix is inspected; its length is taken from the
    // constant so the buffer size and the expected text cannot drift apart.
    let mut first_line = [0u8; EXPECTED_HEADER.len()];
    decoder
        .read_exact(&mut first_line)
        .unwrap_or_else(|e| panic!("failed to read header of {}: {e}", trap_gz.display()));
    assert_eq!(first_line.as_slice(), EXPECTED_HEADER);
}

/// Creates the directory `root/path`, including any missing parents, and
/// returns its full path.
///
/// # Panics
///
/// Panics if the directory cannot be created.
fn create_dir(root: &Path, path: impl AsRef<Path>) -> PathBuf {
    let target = root.join(path);
    std::fs::DirBuilder::new()
        .recursive(true)
        .create(&target)
        .expect("Failed to create directory");
    target
}
Loading

0 comments on commit 96e9dfc

Please sign in to comment.