Skip to content

Commit

Permalink
WIP: chromedp transport integration for netnaija
Browse files Browse the repository at this point in the history
  • Loading branch information
deven96 committed Jul 12, 2020
1 parent 22213c6 commit fb3d7a7
Show file tree
Hide file tree
Showing 4 changed files with 92 additions and 11 deletions.
10 changes: 5 additions & 5 deletions engine/engines.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,10 @@ type Engine interface {
// Scrape : Parse queries a url and return results
func Scrape(engine Engine) ([]Movie, error) {
// Config Vars
seleniumURL := fmt.Sprintf("%s/wd/hub", viper.GetString("selenium-url"))
// seleniumURL := fmt.Sprintf("%s/wd/hub", viper.GetString("selenium-url"))
cacheDir := viper.GetString("cache-dir")
var (
t *transport.Transport
t *transport.ChromeDpTransport
err error
)

Expand All @@ -70,8 +70,8 @@ func Scrape(engine Engine) ([]Movie, error) {

// Add Cloud Flare scraper bypasser
if engine.getName() == "NetNaija" {
log.Debug("Switching to Selenium transport")
t, err = transport.NewSeleniumTransport(http.DefaultTransport, seleniumURL)
log.Debug("Switching to ChromeDpTransport")
t, err = transport.NewChromeDpTransport(http.DefaultTransport)
if err != nil {
log.Fatal(err)
}
Expand All @@ -81,7 +81,7 @@ func Scrape(engine Engine) ([]Movie, error) {
// Close the WebDriver Instance
defer func() {
if engine.getName() == "NetNaija" {
t.WebDriver.Quit()
t.Cancel()
}
}()

Expand Down
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ go 1.14
require (
github.com/bisoncorps/mplayer v0.0.0-20200330192254-e2f647162350
github.com/briandowns/spinner v1.11.1
github.com/chromedp/chromedp v0.5.3
github.com/gocolly/colly v1.2.0
github.com/gocolly/colly/v2 v2.0.2-0.20200302170631-ef2d2b016e78
github.com/iawia002/annie v0.0.0-20200217104547-c4b096ad402a
Expand Down
16 changes: 16 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,10 @@ github.com/briandowns/spinner v1.11.1/go.mod h1:QOuQk7x+EaDASo80FEXwlwiA+j/PPIcX
github.com/cespare/xxhash v1.1.0/go.mod h1:XrSqR1VqqWfGrhpAt58auRo0WTKS1nRRg3ghfAqPWnc=
github.com/cheggaaa/pb v1.0.25 h1:tFpebHTkI7QZx1q1rWGOKhbunhZ3fMaxTvHDWn1bH/4=
github.com/cheggaaa/pb v1.0.25/go.mod h1:pQciLPpbU0oxA0h+VJYYLxO+XeDQb5pZijXscXHm81s=
github.com/chromedp/cdproto v0.0.0-20200116234248-4da64dd111ac h1:T7V5BXqnYd55Hj/g5uhDYumg9Fp3rMTS6bykYtTIFX4=
github.com/chromedp/cdproto v0.0.0-20200116234248-4da64dd111ac/go.mod h1:PfAWWKJqjlGFYJEidUM6aVIWPr0EpobeyVWEEmplX7g=
github.com/chromedp/chromedp v0.5.3 h1:F9LafxmYpsQhWQBdCs+6Sret1zzeeFyHS5LkRF//Ffg=
github.com/chromedp/chromedp v0.5.3/go.mod h1:YLdPtndaHQ4rCpSpBG+IPpy9JvX0VD+7aaLxYgYj28w=
github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI=
github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e h1:fY5BOSpyZCqRo5OhCuC+XN+r/bBCmeuuJtjz+bCNIf8=
github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI=
Expand Down Expand Up @@ -80,6 +84,12 @@ github.com/go-logfmt/logfmt v0.4.0/go.mod h1:3RMwSq7FuexP4Kalkev3ejPJsZTpXXBr9+V
github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY=
github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y=
github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8=
github.com/gobwas/httphead v0.0.0-20180130184737-2c6c146eadee h1:s+21KNqlpePfkah2I+gwHF8xmJWRjooY+5248k6m4A0=
github.com/gobwas/httphead v0.0.0-20180130184737-2c6c146eadee/go.mod h1:L0fX3K22YWvt/FAX9NnzrNzcI4wNYi9Yku4O0LKYflo=
github.com/gobwas/pool v0.2.0 h1:QEmUOlnSjWtnpRGHF3SauEiOsy82Cup83Vf2LcMlnc8=
github.com/gobwas/pool v0.2.0/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw=
github.com/gobwas/ws v1.0.2 h1:CoAavW/wd/kulfZmSIBt6p24n4j7tHgNVCjsfHVNUbo=
github.com/gobwas/ws v1.0.2/go.mod h1:szmBTxLgaFppYjEmNtny/v3w89xOydFnnZMcgRRu/EM=
github.com/gocolly/colly v1.2.0 h1:qRz9YAn8FIH0qzgNUw+HT9UN7wm1oF9OBAilwEWpyrI=
github.com/gocolly/colly v1.2.0/go.mod h1:Hof5T3ZswNVsOHYmba1u03W65HDWgpV5HifSuueE0EA=
github.com/gocolly/colly/v2 v2.0.2-0.20200302170631-ef2d2b016e78 h1:BEK0DJ6e4lXgbYrClpJpa9Bh3IT7HunTNExaP9Y+6gI=
Expand Down Expand Up @@ -150,6 +160,8 @@ github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8Nz
github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak=
github.com/kisielk/errcheck v1.1.0/go.mod h1:EZBBE59ingxPouuu3KfxchcWSUPOHkagtvWXihfKN4Q=
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
github.com/knq/sysutil v0.0.0-20191005231841-15668db23d08 h1:V0an7KRw92wmJysvFvtqtKMAPmvS5O0jtB0nYo6t+gs=
github.com/knq/sysutil v0.0.0-20191005231841-15668db23d08/go.mod h1:dFWs1zEqDjFtnBXsd1vPOZaLsESovai349994nHx3e0=
github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
github.com/konsorten/go-windows-terminal-sequences v1.0.3/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc=
Expand All @@ -163,6 +175,8 @@ github.com/lunixbochs/vtclean v0.0.0-20180621232353-2d01aacdc34a/go.mod h1:pHhQN
github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ=
github.com/magiconair/properties v1.8.1 h1:ZC2Vc7/ZFkGmsVC9KvOjumD+G5lXy2RtTKyzRKO2BQ4=
github.com/magiconair/properties v1.8.1/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ=
github.com/mailru/easyjson v0.7.0 h1:aizVhC/NAAcKWb+5QsU1iNOZb4Yws5UO2I+aIprQITM=
github.com/mailru/easyjson v0.7.0/go.mod h1:KAzv3t3aY1NaHWoQz1+4F1ccyAH66Jk7yos7ldAVICs=
github.com/manifoldco/promptui v0.7.0 h1:3l11YT8tm9MnwGFQ4kETwkzpAwY2Jt9lCrumCUW4+z4=
github.com/manifoldco/promptui v0.7.0/go.mod h1:n4zTdgP0vr0S3w7/O/g98U+e0gwLScEXGwov2nIKuGQ=
github.com/mattn/go-colorable v0.0.9/go.mod h1:9vuHe8Xs5qXnSaW/c/ABM9alt+Vo+STaOChaDxuIBZU=
Expand Down Expand Up @@ -342,6 +356,8 @@ golang.org/x/sys v0.0.0-20190624142023-c5567b49c5d0/go.mod h1:h1NjWce9XRLGQEsW7w
golang.org/x/sys v0.0.0-20191008105621-543471e840be/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20191104094858-e8c54fb511f6 h1:ZJUmhYTp8GbGC0ViZRc2U+MIYQ8xx9MscsdXnclfIhw=
golang.org/x/sys v0.0.0-20191104094858-e8c54fb511f6/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200116001909-b77594299b42 h1:vEOn+mP2zCOVzKckCZy6YsCtDblrpj/w7B9nxGNELpg=
golang.org/x/sys v0.0.0-20200116001909-b77594299b42/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
Expand Down
76 changes: 70 additions & 6 deletions transport/transport.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,16 @@ package transport

import (
"bytes"
"context"
"fmt"
"os"
"io/ioutil"
"net/http"
"os"
"strings"
"time"

"github.com/chromedp/chromedp"
log "github.com/sirupsen/logrus"
"github.com/tebeka/selenium"
"github.com/tebeka/selenium/firefox"
)
Expand All @@ -17,11 +20,17 @@ const (
userAgent = `Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36`
)

type Transport struct {
type SeleniumTransport struct {
upstream http.RoundTripper
WebDriver selenium.WebDriver
}

// ChromeDpTransport is an http.RoundTripper implementation whose RoundTrip
// method fetches pages through chromedp instead of issuing a plain HTTP
// request. It bundles the chromedp context with the function that cancels it.
type ChromeDpTransport struct {
// upstream is the RoundTripper passed to NewChromeDpTransport.
// NOTE(review): the RoundTrip method in this file does not reference it.
upstream http.RoundTripper
// Ctx is the context handed to chromedp.Run on every RoundTrip call.
Ctx context.Context
// Cancel is the cancel function returned together with Ctx by
// chromedp.NewContext; callers invoke it when they are done with the
// transport.
Cancel context.CancelFunc
}

func NewClient() (c *http.Client, err error) {
seleniumURL := fmt.Sprintf("%s/wd/hub", os.Getenv("GOPHIE_SELENIUM_URL"))
fmt.Println("selenium url " + seleniumURL)
Expand All @@ -37,23 +46,78 @@ func NewClient() (c *http.Client, err error) {
return
}

func NewSeleniumTransport(upstream http.RoundTripper, seleniumURL string) (*Transport, error) {
func NewSeleniumTransport(upstream http.RoundTripper, seleniumURL string) (*SeleniumTransport, error) {

caps := selenium.Capabilities{"browserName": "firefox"}
firefoxCaps := firefox.Capabilities{Args: []string{"-headless"}}
caps.AddFirefox(firefoxCaps)
wd, err := selenium.NewRemote(caps, seleniumURL)

if err != nil {
return &Transport{}, err
return &SeleniumTransport{}, err
}
return &Transport{
return &SeleniumTransport{
upstream: upstream,
WebDriver: wd,
}, nil
}

func (t *Transport) RoundTrip(r *http.Request) (*http.Response, error) {
// NewChromeDpTransport builds a ChromeDpTransport around upstream.
//
// A fresh chromedp context is derived from context.Background with chromedp's
// formatted logging routed to logrus at debug level. The context and its
// cancel function are stored on the returned transport so the caller can
// release them through Cancel. The returned error is always nil; it is kept in
// the signature to mirror the other transport constructors.
func NewChromeDpTransport(upstream http.RoundTripper) (*ChromeDpTransport, error) {
	t := &ChromeDpTransport{upstream: upstream}
	t.Ctx, t.Cancel = chromedp.NewContext(
		context.Background(),
		chromedp.WithLogf(log.Debugf),
	)
	return t, nil
}

// RoundTrip implements http.RoundTripper by loading r.URL through chromedp and
// returning the rendered document as the response body.
//
// The page is navigated to with the transport's chromedp context, the call
// waits until a `main` element is visible, and the outer HTML of the `html`
// element becomes the body. The synthesized response always reports
// "200 OK"; the real status of the page load is not inspected.
//
// The response headers are a copy of the request headers with a default
// User-Agent and Referer filled in when absent and Content-Type forced to
// text/html. These headers are not passed to chromedp here; they are only
// echoed back on the response.
//
// On failure the response is nil and the chromedp error is returned, as the
// http.RoundTripper contract requires.
func (t *ChromeDpTransport) RoundTrip(r *http.Request) (*http.Response, error) {
	// A RoundTripper must always close the request body, including on errors.
	if r.Body != nil {
		defer r.Body.Close()
	}

	// A RoundTripper must not modify the caller's request, so the defaults
	// are applied to a copy. Clone returns nil for a nil header map, which
	// would make Set panic, hence the explicit allocation.
	header := r.Header.Clone()
	if header == nil {
		header = make(http.Header)
	}
	if header.Get("User-Agent") == "" {
		header.Set("User-Agent", userAgent)
	}
	if header.Get("Referer") == "" {
		header.Set("Referer", r.URL.String())
	}
	header.Set("Content-Type", "text/html")

	log.Debug("Set Headers for page ", r.URL.String())

	var body string
	if err := chromedp.Run(t.Ctx,
		chromedp.Navigate(r.URL.String()),
		chromedp.WaitVisible(`main`),
		chromedp.OuterHTML("html", &body),
	); err != nil {
		// Returning a non-nil response together with an error makes
		// http.Client log a warning and discard the response.
		return nil, err
	}
	log.Debug("Successfully retrieved body")

	return &http.Response{
		Status:        "200 OK",
		StatusCode:    200,
		Proto:         "HTTP/1.1",
		ProtoMajor:    1,
		ProtoMinor:    1,
		Body:          ioutil.NopCloser(bytes.NewBufferString(body)),
		ContentLength: int64(len(body)),
		Request:       r,
		Header:        header,
	}, nil
}

func (t *SeleniumTransport) RoundTrip(r *http.Request) (*http.Response, error) {
var (
title string
body string
Expand Down

0 comments on commit fb3d7a7

Please sign in to comment.