From fb3d7a7c71f1a88d00332b9d9d415c068e741808 Mon Sep 17 00:00:00 2001 From: Diretnan Domnan Date: Sun, 12 Jul 2020 19:12:03 +0100 Subject: [PATCH] WIP: chromedp transport integration for netnaija --- engine/engines.go | 10 +++--- go.mod | 1 + go.sum | 16 +++++++++ transport/transport.go | 76 ++++++++++++++++++++++++++++++++++++++---- 4 files changed, 92 insertions(+), 11 deletions(-) diff --git a/engine/engines.go b/engine/engines.go index ad1a438..a7ca6d6 100644 --- a/engine/engines.go +++ b/engine/engines.go @@ -55,10 +55,10 @@ type Engine interface { // Scrape : Parse queries a url and return results func Scrape(engine Engine) ([]Movie, error) { // Config Vars - seleniumURL := fmt.Sprintf("%s/wd/hub", viper.GetString("selenium-url")) + // seleniumURL := fmt.Sprintf("%s/wd/hub", viper.GetString("selenium-url")) cacheDir := viper.GetString("cache-dir") var ( - t *transport.Transport + t *transport.ChromeDpTransport err error ) @@ -70,8 +70,8 @@ func Scrape(engine Engine) ([]Movie, error) { // Add Cloud Flare scraper bypasser if engine.getName() == "NetNaija" { - log.Debug("Switching to Selenium transport") - t, err = transport.NewSeleniumTransport(http.DefaultTransport, seleniumURL) + log.Debug("Switching to ChromeDpTransport") + t, err = transport.NewChromeDpTransport(http.DefaultTransport) if err != nil { log.Fatal(err) } @@ -81,7 +81,7 @@ func Scrape(engine Engine) ([]Movie, error) { // Close the WebDriver Instance defer func() { if engine.getName() == "NetNaija" { - t.WebDriver.Quit() + t.Cancel() } }() diff --git a/go.mod b/go.mod index d319677..46f65ce 100644 --- a/go.mod +++ b/go.mod @@ -5,6 +5,7 @@ go 1.14 require ( github.com/bisoncorps/mplayer v0.0.0-20200330192254-e2f647162350 github.com/briandowns/spinner v1.11.1 + github.com/chromedp/chromedp v0.5.3 github.com/gocolly/colly v1.2.0 github.com/gocolly/colly/v2 v2.0.2-0.20200302170631-ef2d2b016e78 github.com/iawia002/annie v0.0.0-20200217104547-c4b096ad402a diff --git a/go.sum b/go.sum index 78ce8e0..e3e0830 100644 --- a/go.sum +++ b/go.sum @@ -50,6 +50,10 @@ github.com/briandowns/spinner v1.11.1/go.mod h1:QOuQk7x+EaDASo80FEXwlwiA+j/PPIcX github.com/cespare/xxhash v1.1.0/go.mod h1:XrSqR1VqqWfGrhpAt58auRo0WTKS1nRRg3ghfAqPWnc= github.com/cheggaaa/pb v1.0.25 h1:tFpebHTkI7QZx1q1rWGOKhbunhZ3fMaxTvHDWn1bH/4= github.com/cheggaaa/pb v1.0.25/go.mod h1:pQciLPpbU0oxA0h+VJYYLxO+XeDQb5pZijXscXHm81s= +github.com/chromedp/cdproto v0.0.0-20200116234248-4da64dd111ac h1:T7V5BXqnYd55Hj/g5uhDYumg9Fp3rMTS6bykYtTIFX4= +github.com/chromedp/cdproto v0.0.0-20200116234248-4da64dd111ac/go.mod h1:PfAWWKJqjlGFYJEidUM6aVIWPr0EpobeyVWEEmplX7g= +github.com/chromedp/chromedp v0.5.3 h1:F9LafxmYpsQhWQBdCs+6Sret1zzeeFyHS5LkRF//Ffg= +github.com/chromedp/chromedp v0.5.3/go.mod h1:YLdPtndaHQ4rCpSpBG+IPpy9JvX0VD+7aaLxYgYj28w= github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI= github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e h1:fY5BOSpyZCqRo5OhCuC+XN+r/bBCmeuuJtjz+bCNIf8= github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI= @@ -80,6 +84,12 @@ github.com/go-logfmt/logfmt v0.4.0/go.mod h1:3RMwSq7FuexP4Kalkev3ejPJsZTpXXBr9+V github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY= github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y= github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8= +github.com/gobwas/httphead v0.0.0-20180130184737-2c6c146eadee h1:s+21KNqlpePfkah2I+gwHF8xmJWRjooY+5248k6m4A0= +github.com/gobwas/httphead v0.0.0-20180130184737-2c6c146eadee/go.mod h1:L0fX3K22YWvt/FAX9NnzrNzcI4wNYi9Yku4O0LKYflo= +github.com/gobwas/pool v0.2.0 h1:QEmUOlnSjWtnpRGHF3SauEiOsy82Cup83Vf2LcMlnc8= +github.com/gobwas/pool v0.2.0/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw= +github.com/gobwas/ws v1.0.2 h1:CoAavW/wd/kulfZmSIBt6p24n4j7tHgNVCjsfHVNUbo= +github.com/gobwas/ws v1.0.2/go.mod h1:szmBTxLgaFppYjEmNtny/v3w89xOydFnnZMcgRRu/EM= github.com/gocolly/colly v1.2.0 h1:qRz9YAn8FIH0qzgNUw+HT9UN7wm1oF9OBAilwEWpyrI= github.com/gocolly/colly v1.2.0/go.mod h1:Hof5T3ZswNVsOHYmba1u03W65HDWgpV5HifSuueE0EA= github.com/gocolly/colly/v2 v2.0.2-0.20200302170631-ef2d2b016e78 h1:BEK0DJ6e4lXgbYrClpJpa9Bh3IT7HunTNExaP9Y+6gI= @@ -150,6 +160,8 @@ github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8Nz github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak= github.com/kisielk/errcheck v1.1.0/go.mod h1:EZBBE59ingxPouuu3KfxchcWSUPOHkagtvWXihfKN4Q= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/knq/sysutil v0.0.0-20191005231841-15668db23d08 h1:V0an7KRw92wmJysvFvtqtKMAPmvS5O0jtB0nYo6t+gs= +github.com/knq/sysutil v0.0.0-20191005231841-15668db23d08/go.mod h1:dFWs1zEqDjFtnBXsd1vPOZaLsESovai349994nHx3e0= github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= github.com/konsorten/go-windows-terminal-sequences v1.0.3/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc= @@ -163,6 +175,8 @@ github.com/lunixbochs/vtclean v0.0.0-20180621232353-2d01aacdc34a/go.mod h1:pHhQN github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ= github.com/magiconair/properties v1.8.1 h1:ZC2Vc7/ZFkGmsVC9KvOjumD+G5lXy2RtTKyzRKO2BQ4= github.com/magiconair/properties v1.8.1/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ= +github.com/mailru/easyjson v0.7.0 h1:aizVhC/NAAcKWb+5QsU1iNOZb4Yws5UO2I+aIprQITM= +github.com/mailru/easyjson v0.7.0/go.mod h1:KAzv3t3aY1NaHWoQz1+4F1ccyAH66Jk7yos7ldAVICs= github.com/manifoldco/promptui v0.7.0 h1:3l11YT8tm9MnwGFQ4kETwkzpAwY2Jt9lCrumCUW4+z4= github.com/manifoldco/promptui v0.7.0/go.mod h1:n4zTdgP0vr0S3w7/O/g98U+e0gwLScEXGwov2nIKuGQ= github.com/mattn/go-colorable v0.0.9/go.mod h1:9vuHe8Xs5qXnSaW/c/ABM9alt+Vo+STaOChaDxuIBZU= @@ -342,6 +356,8 @@ golang.org/x/sys v0.0.0-20190624142023-c5567b49c5d0/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20191008105621-543471e840be/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191104094858-e8c54fb511f6 h1:ZJUmhYTp8GbGC0ViZRc2U+MIYQ8xx9MscsdXnclfIhw= golang.org/x/sys v0.0.0-20191104094858-e8c54fb511f6/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200116001909-b77594299b42 h1:vEOn+mP2zCOVzKckCZy6YsCtDblrpj/w7B9nxGNELpg= +golang.org/x/sys v0.0.0-20200116001909-b77594299b42/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= diff --git a/transport/transport.go b/transport/transport.go index 25ca6b0..f693563 100644 --- a/transport/transport.go +++ b/transport/transport.go @@ -2,13 +2,16 @@ package transport import ( "bytes" + "context" "fmt" - "os" "io/ioutil" "net/http" + "os" "strings" "time" + "github.com/chromedp/chromedp" + log "github.com/sirupsen/logrus" "github.com/tebeka/selenium" "github.com/tebeka/selenium/firefox" ) @@ -17,11 +20,17 @@ const ( userAgent = `Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36` ) -type Transport struct { +type SeleniumTransport struct { upstream http.RoundTripper WebDriver selenium.WebDriver } +type ChromeDpTransport struct { + upstream http.RoundTripper + Ctx context.Context + Cancel context.CancelFunc +} + func NewClient() (c *http.Client, err error) { seleniumURL := fmt.Sprintf("%s/wd/hub", os.Getenv("GOPHIE_SELENIUM_URL")) fmt.Println("selenium url " + seleniumURL) @@ -37,7 +46,7 @@ func NewClient() (c *http.Client, err error) { return } -func NewSeleniumTransport(upstream http.RoundTripper, seleniumURL string) (*Transport, error) { +func NewSeleniumTransport(upstream http.RoundTripper, seleniumURL string) (*SeleniumTransport, error) { caps := selenium.Capabilities{"browserName": "firefox"} firefoxCaps := firefox.Capabilities{Args: []string{"-headless"}} @@ -45,15 +54,70 @@ func NewSeleniumTransport(upstream http.RoundTripper, seleniumURL string) (*Tran wd, err := selenium.NewRemote(caps, seleniumURL) if err != nil { - return &Transport{}, err + return &SeleniumTransport{}, err } - return &Transport{ + return &SeleniumTransport{ upstream: upstream, WebDriver: wd, }, nil } -func (t *Transport) RoundTrip(r *http.Request) (*http.Response, error) { +func NewChromeDpTransport(upstream http.RoundTripper) (*ChromeDpTransport, error) { + + ctx, cancel := chromedp.NewContext( + context.Background(), + chromedp.WithLogf(log.Debugf), + ) + + return &ChromeDpTransport{ + upstream: upstream, + Ctx: ctx, + Cancel: cancel, + }, nil +} + +func (t *ChromeDpTransport) RoundTrip(r *http.Request) (*http.Response, error) { + var ( + body string + err error + ) + + if r.Header.Get("User-Agent") == "" { + r.Header.Set("User-Agent", userAgent) + } + + if r.Header.Get("Referer") == "" { + r.Header.Set("Referer", r.URL.String()) + } + + r.Header.Set("Content-Type", "text/html") + + log.Debug("Set Headers for page ", r.URL.String()) + + if err = chromedp.Run(t.Ctx, + chromedp.Navigate(r.URL.String()), + chromedp.WaitVisible(`main`), + chromedp.OuterHTML("html", &body), + ); err != nil { + return &http.Response{}, err + } + log.Debug("Successfully retrieved body") + + response := &http.Response{ + Status: "200 OK", + StatusCode: 200, + Proto: "HTTP/1.1", + ProtoMajor: 1, + ProtoMinor: 1, + Body: ioutil.NopCloser(bytes.NewBufferString(body)), + ContentLength: int64(len(body)), + Request: r, + Header: r.Header, + } + return response, nil +} + +func (t *SeleniumTransport) RoundTrip(r *http.Request) (*http.Response, error) { var ( title string body string