
Commit e8b7f81

Revert "Reduce filter list size by using domain popularity, updating encoding"

This reverts commit 1db59c6.
dharb authored Jan 15, 2025
1 parent 1db59c6 commit e8b7f81
Showing 6 changed files with 9 additions and 247 deletions.
1 change: 0 additions & 1 deletion .gitignore
@@ -13,4 +13,3 @@ addon/*.js
 .DS_Store
 rules/filterlists/easylist_*.txt
 lib/filterlist-engine.ts
-data/top-1m.csv
8 changes: 0 additions & 8 deletions package-lock.json

Some generated files are not rendered by default.

1 change: 0 additions & 1 deletion package.json
@@ -58,7 +58,6 @@
         "bulma": "^1.0.2",
         "chai": "^5.0.0",
         "chokidar-cli": "^3.0.0",
-        "csv-parse": "^5.6.0",
         "esbuild": "^0.24.0",
         "eslint": "^9.14.0",
         "globals": "^15.11.0",
41 changes: 3 additions & 38 deletions scripts/compile-filterlist.mjs
@@ -14,52 +14,17 @@ const engine = FiltersEngine.parse(filterlistContent, {
     loadCSPFilters: false,
 });
 const serializedEngine = engine.serialize();
-const properlySizedBuffer = ensure32BitAlignment(serializedEngine.buffer);
-const engineJson = JSON.stringify(Array.from(new Uint32Array(properlySizedBuffer)));
-const engineLength = serializedEngine.length;
-
-/**
- * Ensures the buffer is aligned to a 32-bit boundary by padding if necessary
- * @param {Buffer} originalBuffer
- * @returns {Buffer} - The aligned buffer
- */
-
-function ensure32BitAlignment(originalBuffer) {
-    // Calculate the necessary padding for 32-bit alignment
-    const padding = originalBuffer.byteLength % 4 === 0 ? 0 : 4 - (originalBuffer.byteLength % 4);
-
-    if (padding !== 0) {
-        // Create a new buffer with the required padding for 32-bit alignment
-        const paddedBuffer = new ArrayBuffer(originalBuffer.byteLength + padding);
-
-        // Create a Uint8Array view over the original and new buffers
-        const originalView = new Uint8Array(originalBuffer);
-        const paddedView = new Uint8Array(paddedBuffer);
-
-        // Copy the original data into the new buffer
-        paddedView.set(originalView);
-
-        // Set the padding bytes to zero
-        for (let i = originalBuffer.byteLength; i < paddedBuffer.byteLength; i++) {
-            paddedView[i] = 0;
-        }
-
-        return paddedBuffer;
-    }
-
-    // If already aligned, return the original buffer
-    return originalBuffer;
-}
+const engineJson = JSON.stringify(Array.from(serializedEngine));
 
 fs.writeFile(
     path.join(rulesDir, '../lib/filterlist-engine.ts'),
     `
 declare global {
     const BUNDLE_FILTERLIST: boolean;
 }
-const serializedEngine = /* @__PURE__ */ new Uint8Array(new Uint32Array(
+const serializedEngine = /* @__PURE__ */ new Uint8Array(
 ${engineJson}
-).buffer).slice(0, ${engineLength});
+);
 const emptyEngine = /* @__PURE__ */ new Uint8Array([]);
 export default BUNDLE_FILTERLIST ? serializedEngine : emptyEngine;
 `,
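Aside, not part of the commit: the deleted ensure32BitAlignment helper existed because a Uint32Array view can only be constructed over a buffer whose byte length is a multiple of 4, while the plain Uint8Array encoding restored above has no such constraint. A minimal Node.js sketch (illustrative values only) of the failure mode and the padding workaround:

    // Sketch only: why the reverted Uint32Array encoding needed padding.
    const bytes = new Uint8Array([1, 2, 3, 4, 5]); // 5 bytes: not 32-bit aligned

    try {
        new Uint32Array(bytes.buffer); // throws RangeError
    } catch (e) {
        console.log(e instanceof RangeError); // true
    }

    // Pad to the next multiple of 4 (new Uint8Arrays are zero-filled),
    // then the 32-bit view works:
    const padded = new Uint8Array(Math.ceil(bytes.byteLength / 4) * 4);
    padded.set(bytes);
    console.log(new Uint32Array(padded.buffer).length); // 2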
200 changes: 4 additions & 196 deletions scripts/rebuild-filterlist.mjs
@@ -3,190 +3,14 @@
 import fs from 'fs';
 import path from 'path';
 import { fileURLToPath } from 'url';
-import { parse } from 'csv-parse/sync';
-import { getDomain } from 'tldts-experimental';
-import { Readable } from 'stream';
-import { finished } from 'stream/promises';
 
 const rulesDir = path.join(path.dirname(fileURLToPath(import.meta.url)), '../rules');
-const dataDir = path.join(path.dirname(fileURLToPath(import.meta.url)), '../data');
 
 const easylistRevision = fs.readFileSync(path.join(rulesDir, 'filterlists', 'easylist_revision.txt'), 'utf-8');
-const domainSpecificLists = [
-    'easylist_cookie_specific_uBO.txt',
-    'easylist_cookie_specific_hide.txt',
-    'easylist_cookie_international_specific_hide.txt',
-];
-
-const MAX_DOMAIN_RANK = 100000;
-
-/** @typedef {{ rules: Record<string, string[]>}} FilterlistJSON */
-
-/** @type {Map<string, string>} */
-let domainMap;
-
-/**
- * Processes a list containing domain-specific cosmetic rules, filters
- * out unsupported rules and removes longtail domain rules
- * @param {String} listFileName
- */
-async function processDomainSpecificFilterList(listFileName) {
-    console.log(`Processing ${listFileName}`);
-    const data = fs.readFileSync(path.join(rulesDir, 'filterlists', listFileName), 'utf-8');
-    const lines = data.split('\n');
-    /** @type {FilterlistJSON} */
-    const filterlistJSON = { rules: {} };
-
-    // Remove unsupported rule types:
-    // ! at start of line indicates comment
-    // ## at start of line indicates a non-domain-specific rule
-    // || at start of line indicates network rule
-    // :remove is uBO syntax for removing an element from the DOM
-    // :upward is uBO syntax for iterating upward from an anchor element
-    // redirect-rule is uBO syntax for redirecting a request to a surrogate script
-    const filteredLines = lines.filter(
-        (line) =>
-            !line.startsWith('!') &&
-            !line.startsWith('##') &&
-            !line.startsWith('||') &&
-            !line.includes(':remove') &&
-            !line.includes(':upward') &&
-            !line.includes('redirect-rule'),
-    );
-    // Dump rules into json structure for parsing
-    for (const rule of filteredLines) {
-        const splitRule = rule.split('##');
-        const target = splitRule[0];
-        const action = splitRule[1];
-
-        if (!target || !action) {
-            continue;
-        }
-
-        if (!filterlistJSON.rules[target]) {
-            filterlistJSON.rules[target] = [action];
-        } else {
-            filterlistJSON.rules[target].push(action);
-        }
-    }
-
-    // Remove any targets that contain a +js rule. we do it this way because
-    // when a target has a :js rule, we must also remove any css rules
-    for (const item in filterlistJSON.rules) {
-        const containsJS = filterlistJSON.rules[item].some((filter) => filter.includes('+js'));
-        if (containsJS) {
-            delete filterlistJSON.rules[item];
-        }
-    }
-
-    for (const item in filterlistJSON.rules) {
-        let domains;
-
-        if (item.includes(',')) {
-            domains = item.split(',');
-        } else {
-            domains = [item];
-        }
-
-        const filteredDomains = domains.filter((domain) => {
-            const tlDomain = getDomain(domain);
-            return domainMap.has(tlDomain);
-        });
-
-        // If no domains meet cutoff, remove rule entirely. If one or more
-        // domains meet cutoff, remove those that don't
-        if (filteredDomains.length === 0) {
-            delete filterlistJSON.rules[item];
-        } else if (filteredDomains.length !== domains.length) {
-            const filteredDomainString = filteredDomains.join(',');
-            filterlistJSON.rules[filteredDomainString] = filterlistJSON.rules[item];
-            delete filterlistJSON.rules[item];
-        }
-    }
-
-    await convertAndWriteABP(filterlistJSON, listFileName);
-}
-
-/**
- * Fetch domain popularity data from Tranco and load it into a Map for querying
- */
-async function loadTrancoList() {
-    const csvLocation = path.resolve(path.join(dataDir, 'top-1m.csv'));
-    if (!fs.existsSync(csvLocation)) {
-        console.log('Fetching latest Tranco list..');
-        // Download most recent list from https://tranco-list.eu using API
-        const date = new Date();
-        let available;
-        let download;
-
-        do {
-            const formattedDate = date.toISOString().split('T')[0].replace('/', '');
-            const res = await (await fetch(`https://tranco-list.eu/api/lists/date/${formattedDate}`)).json();
-
-            available = res.available;
-            download = res.download;
-
-            // Decrement date until a daily list is available
-            date.setTime(date.getTime() - 7 * 24 * 3600000);
-        } while (!available);
-
-        const listData = await fetch(download);
-        await finished(Readable.fromWeb(listData.body).pipe(fs.createWriteStream(csvLocation)));
-    }
-
-    if (typeof domainMap !== 'object') {
-        const trancoCSV = fs.readFileSync(path.join(dataDir, 'top-1m.csv'), 'utf-8');
-        /** @type {string[][]} */
-        const records = parse(trancoCSV, {
-            columns: false,
-            skip_empty_lines: true,
-        });
-
-        domainMap = new Map();
-
-        records.slice(0, MAX_DOMAIN_RANK).forEach(([key, value]) => {
-            domainMap.set(value, key);
-        });
-    }
-}
-
-/**
- * Convert filter list to ABP format and write to disk
- * @param {FilterlistJSON} JSONList
- * @param {String} fileName
- */
-function convertAndWriteABP(JSONList, fileName) {
-    return new Promise((resolve, reject) => {
-        const fileLocation = path.join(rulesDir, 'filterlists', fileName);
-        const stream = fs.createWriteStream(fileLocation, { flags: 'w' });
-
-        stream.once('open', () => {
-            for (const item in JSONList.rules) {
-                JSONList.rules[item].forEach((filter) => {
-                    const abpRule = item + '##' + filter;
-                    stream.write(abpRule + '\n');
-                });
-            }
-            stream.end();
-        });
-
-        stream.on('finish', () => {
-            console.log(`ABP format list written to ${fileLocation}`);
-            resolve();
-        });
-
-        stream.on('error', (err) => {
-            console.error(`An error occurred while writing ${fileLocation}: ${err.message}`);
-        });
-    });
-}
 
-/**
- * Combine all filter lists into a single filterlist.txt
- */
-function combineFilterLists() {
-    // TODO: consider using python-abp (flrender) to generate filterlist properly
-    const filterlistContent = `[Adblock Plus 2.0]
+// TODO: consider using python-abp (flrender) to generate filterlist properly
+const filterlistContent = `
+[Adblock Plus 2.0]
 ! Title: CPM Cosmetic Filter List
 ! Based on EasyList ${easylistRevision}
 ! DO NOT EDIT MANUALLY, your changes will be lost
@@ -204,20 +28,4 @@ ${fs.readFileSync(path.join(rulesDir, 'filterlists', 'easylist_cookie_allowlist_
 ${fs.readFileSync(path.join(rulesDir, 'filterlists', 'overrides.txt'), 'utf-8')}
 `;
 
-    fs.writeFile(path.join(rulesDir, 'filterlist.txt'), filterlistContent, () => console.log('Written filterlist.txt'));
-}
-
-/**
- * Parse and refine lists using domain popularity, then combine them and write filterlist.txt
- */
-async function rebuildFilterList() {
-    await loadTrancoList();
-
-    for (const list of domainSpecificLists) {
-        await processDomainSpecificFilterList(list);
-    }
-
-    combineFilterLists();
-}
-
-rebuildFilterList();
+fs.writeFile(path.join(rulesDir, 'filterlist.txt'), filterlistContent, () => console.log('Written filterlist.txt'));
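Aside, not part of the commit: the bulk of what this revert deletes is the pruning pass in processDomainSpecificFilterList. A simplified standalone sketch of that logic, with a hypothetical two-entry rank map standing in for the Tranco top-100k (the deleted script also normalized each domain through getDomain from tldts-experimental before the lookup):

    // Sketch only: prune an ABP cosmetic rule against a domain popularity map.
    const domainMap = new Map([
        ['example.com', '1'],
        ['example.org', '2'],
    ]);

    function pruneRule(rule, domainMap) {
        const [target, action] = rule.split('##'); // e.g. 'a.com,b.com##.selector'
        if (!target || !action) {
            return null; // comments, network rules, etc. were filtered out earlier
        }
        const kept = target.split(',').filter((domain) => domainMap.has(domain));
        // Drop the rule entirely if no domain meets the cutoff;
        // otherwise keep it for the popular domains only.
        return kept.length === 0 ? null : kept.join(',') + '##' + action;
    }

    console.log(pruneRule('example.com,longtail.site##.cookie-banner', domainMap));
    // -> 'example.com##.cookie-banner'
    console.log(pruneRule('longtail.site##.cookie-banner', domainMap));
    // -> null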
5 changes: 2 additions & 3 deletions tsconfig.json
@@ -4,10 +4,9 @@
         "allowJs": true,
         "checkJs": true,
         "resolveJsonModule": true,
-        "moduleResolution": "NodeNext",
+        "moduleResolution": "node",
         "noImplicitAny": true,
-        "allowSyntheticDefaultImports": true,
-        "module": "NodeNext"
+        "allowSyntheticDefaultImports": true
     },
     "exclude": ["node_modules/*", "dist/*"]
 }
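Aside, not part of the commit: as far as I understand it, the practical effect of the reverted tsconfig settings is how relative imports resolve in the project's ES modules. Under the removed "NodeNext" mode, relative ESM imports must carry an explicit file extension; the restored classic "node" resolution also accepts extensionless specifiers. A hypothetical illustration (the import path is made up):

    // Accepted under "moduleResolution": "node":
    import engine from './lib/filterlist-engine';

    // Under "NodeNext", an explicit extension would be required instead:
    // import engine from './lib/filterlist-engine.js';

    console.log(engine.byteLength);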
