Skip to content

Commit

Permalink
add web crawler and publish new version
Browse files Browse the repository at this point in the history
  • Loading branch information
surgeharb committed Dec 2, 2019
1 parent 4180953 commit e31fce4
Show file tree
Hide file tree
Showing 5 changed files with 224 additions and 16 deletions.
47 changes: 41 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ npm install dynamic-parsers --save
# Multi-Platform Strings Files Manager

### Define Translations Data Structure
```js
```ts
// Platform is an exported enum in our StringsGenerator Class:
// export enum Platform { WEB = 0, IOS = 1, ANDROID = 2, API = 3 }

Expand All @@ -22,14 +22,14 @@ const translations = [
```

### Setup Generator
```js
```ts
const { StringsGenerator } = require('dynamic-parsers');

const generator = new StringsGenerator(translations, ['en', 'fr']);
```

### Option 1 - Write to a Local File
```js
```ts
// async/await
(async() => {

Expand All @@ -50,7 +50,7 @@ generator.generateZip(platform).then(zipFile => {
```

### Option 2 - Serve in API using a Simple Express Server
```js
```ts
app.get('/generator', async (req, res) => {

const platform = 0; // = Platform.WEB
Expand All @@ -76,7 +76,7 @@ app.get('/generator', async (req, res) => {

### Translations Interface

```js
```ts
export enum Platform { WEB = 0, IOS = 1, ANDROID = 2, API = 3 }

export interface Transalations {
Expand All @@ -91,7 +91,42 @@ export interface Transalations {

# Web Crawler and Scraper

### Keep in touch! Coming Soon!
```ts
const crawlerConfig = {
url: 'https://www.example.com/search/result?q=macbook',
itemSelector: 'ul.products-grid .item',
itemDetails: [
{ key: 'name', selector: '.product-name' },
{ key: 'sale', selector: '.product-sale .price' },
{ key: 'price', selector: '.product-details .price' },
{ key: 'image', selector: '.product-image img', attribute: 'data-src' },
]
};

(async () => {
const data = await WebCrawl(crawlerConfig);
console.log("LOG: data", data);
})();
```

### Crawler Config Structure

```ts
// Either "html" or "url" is required; if both are provided, "url" takes precedence.

export interface ItemDetailsConfig {
key: string;
selector: string;
attribute?: string;
}

export interface CrawlerConfig {
url?: string;
html?: string;
itemSelector: string;
itemDetails: ItemDetailsConfig[];
}
```

<br>

Expand Down
132 changes: 126 additions & 6 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 5 additions & 3 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "dynamic-parsers",
"version": "1.1.0",
"version": "2.0.0",
"description": "Dynamic Parsers - Strings Files Generator and Web Crawler",
"homepage": "https://github.com/surgeharb",
"main": "build/index.js",
Expand All @@ -19,9 +19,11 @@
"@types/node": "^12.0.10",
"gulp": "^4.0.2",
"gulp-typescript": "^5.0.1",
"typescript": "^3.5.2"
"typescript": "^3.7.2"
},
"dependencies": {
"jszip": "^3.2.1"
"cheerio": "^1.0.0-rc.3",
"jszip": "^3.2.1",
"node-fetch": "^2.6.0"
}
}
51 changes: 51 additions & 0 deletions src/web-crawler/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import fetch from 'node-fetch';
import * as cheerio from 'cheerio';

/**
 * Describes one field to extract from each matched item.
 */
export interface ItemDetailsConfig {
/** Property name under which the extracted value is stored in the item's result object. */
key: string;
/** CSS selector looked up inside each matched item (via `find`), not against the whole document. */
selector: string;
/** When set, the value of this attribute is extracted; otherwise the element's text is used. */
attribute?: string;
}

/**
 * Configuration accepted by `WebCrawl`.
 *
 * Either `url` or `html` must be provided; when `url` is set the page is
 * fetched and its body is used as the HTML, regardless of `html`.
 */
export interface CrawlerConfig {
/** Address of the page to fetch and parse. */
url?: string;
/** Raw HTML to parse; only used when `url` is not set. */
html?: string;
/** CSS selector matching every item to extract from the document. */
itemSelector: string;
/** Fields to extract from each matched item. */
itemDetails: ItemDetailsConfig[];
}

/**
 * Extracts a list of items from an HTML document using CSS selectors.
 *
 * The document is downloaded from `config.url` when it is set; otherwise
 * `config.html` is parsed directly. Every element matching
 * `config.itemSelector` yields one result object whose properties are
 * described by `config.itemDetails`.
 *
 * @param config - Source of the HTML and the selectors to apply to it.
 * @returns One object per matched item, mapping each configured `key` to the
 *   trimmed attribute value or text content (`''` when nothing is found).
 *   An empty array is returned, and an error is logged, when neither `url`
 *   nor `html` is provided or when the HTTP request does not succeed.
 */
export const WebCrawl = async (config: CrawlerConfig): Promise<Record<string, string>[]> => {
  const { url, itemSelector, itemDetails } = config;
  let { html } = config;

  if (url) {
    const response = await fetch(url);

    // fetch only rejects on network failures; without this check an HTTP
    // error page would be parsed as if it were the requested document.
    if (!response.ok) {
      console.error(`Request to "${url}" failed with status ${response.status}`);
      return [];
    }

    html = await response.text();
  } else if (!html) {
    console.error('Please specify either "html" or "url" in config');
    return [];
  }

  const $ = cheerio.load(html);
  const formattedData: Record<string, string>[] = [];

  for (const item of $(itemSelector).toArray()) {
    const itemData: Record<string, string> = {};

    for (const { key, selector, attribute } of itemDetails) {
      const detailsElement = $(item).find(selector);

      // Keep the selection and the extracted string in separate variables:
      // `find` always returns a selection, but `attr` yields undefined when
      // the attribute (or the element itself) is missing.
      const rawValue = attribute ? detailsElement.attr(attribute) : detailsElement.text();

      itemData[key] = (rawValue ?? '').trim();
    }

    formattedData.push(itemData);
  }

  return formattedData;
};
2 changes: 1 addition & 1 deletion tsconfig.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
"allowSyntheticDefaultImports": true,
"emitDecoratorMetadata": true,
"experimentalDecorators": true,
"target": "esnext",
"target": "es2018",
"sourceMap": true,
"outDir": "./build",
"baseUrl": "./src",
Expand Down

0 comments on commit e31fce4

Please sign in to comment.