Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

#198 never use headless if headless wait < 0 #199

Merged
merged 1 commit into from
Apr 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ require (
github.com/gocarina/gocsv v0.0.0-20211020200912-82fc2684cc48
github.com/gorilla/mux v1.8.0
github.com/gosuri/uiprogress v0.0.1
github.com/k4s/webrowser v0.0.0-20160107091637-934d526d0f27
github.com/knakk/rdf v0.0.0-20190304171630-8521bf4c5042
github.com/mafredri/cdp v0.32.0
github.com/minio/minio-go/v7 v7.0.52
Expand Down Expand Up @@ -57,7 +56,6 @@ require (
github.com/jmespath/go-jmespath v0.4.0 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/k4s/phantomgo v0.0.0-20161104020322-11963773aa04 // indirect
github.com/klauspost/compress v1.16.0 // indirect
github.com/klauspost/cpuid/v2 v2.2.4 // indirect
github.com/magiconair/properties v1.8.6 // indirect
Expand Down
4 changes: 0 additions & 4 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -386,10 +386,6 @@ github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/X
github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU=
github.com/julienschmidt/httprouter v1.2.0/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7VTCxuUUipMqKk8s4w=
github.com/k0kubun/go-ansi v0.0.0-20180517002512-3bf9e2903213/go.mod h1:vNUNkEQ1e29fT/6vq2aBdFsgNPmy8qMdSay1npru+Sw=
github.com/k4s/phantomgo v0.0.0-20161104020322-11963773aa04 h1:tA/Xc0VnJtHIdxAML0WraKG+ErOYVgJ6oDcuxOloZOM=
github.com/k4s/phantomgo v0.0.0-20161104020322-11963773aa04/go.mod h1:YWxksSger0gUVO0tKEY/mVkyBTPoKAf4KX/S8Vt7ndc=
github.com/k4s/webrowser v0.0.0-20160107091637-934d526d0f27 h1:YBHxM4fmxQghvs3Ty/rQIPnY+tdCFheIOMj/h0Zw0A8=
github.com/k4s/webrowser v0.0.0-20160107091637-934d526d0f27/go.mod h1:kd1f/k6xHQrfwfszgeiZklsPzBNJJj/el6cjp86YowQ=
github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
github.com/kisielk/godepgraph v0.0.0-20190626013829-57a7e4a651a9/go.mod h1:Gb5YEgxqiSSVrXKWQxDcKoCM94NO5QAwOwTaVmIUAMI=
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
Expand Down
33 changes: 15 additions & 18 deletions internal/summoner/acquire/acquire.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,17 +51,17 @@ func ResRetrieve(v1 *viper.Viper, mc *minio.Client, m map[string][]string, runSt
wg.Wait()
}

func getConfig(v1 *viper.Viper, sourceName string) (string, int, int64, error) {
func getConfig(v1 *viper.Viper, sourceName string) (string, int, int64, int, error) {
bucketName, err := configTypes.GetBucketName(v1)
if err != nil {
return bucketName, 0, 0, err
return bucketName, 0, 0, 0, err
}

var mcfg configTypes.Summoner
mcfg, err = configTypes.ReadSummmonerConfig(v1.Sub("summoner"))

if err != nil {
return bucketName, 0, 0, err
return bucketName, 0, 0, 0, err
}
// Set default thread counts and global delay
tc := mcfg.Threads
Expand All @@ -74,25 +74,24 @@ func getConfig(v1 *viper.Viper, sourceName string) (string, int, int64, error) {
// look for a domain specific override crawl delay
sources, err := configTypes.GetSources(v1)
source, err := configTypes.GetSourceByName(sources, sourceName)

hw := source.HeadlessWait
if err != nil {
return bucketName, tc, delay, err
return bucketName, tc, delay, hw, err
}

if source.Delay != 0 && source.Delay > delay {
delay = source.Delay
tc = 1
log.Info("Crawl delay set to ", delay, " for ", sourceName)
}

log.Info("Thread count ", tc, " delay ", delay)
return bucketName, tc, delay, nil
return bucketName, tc, delay, hw, nil
}

func getDomain(v1 *viper.Viper, mc *minio.Client, urls []string, sourceName string,
wg *sync.WaitGroup, repologger *log.Logger, repoStats *common.RepoStats) {

bucketName, tc, delay, err := getConfig(v1, sourceName)
bucketName, tc, delay, headlessWait, err := getConfig(v1, sourceName)
if err != nil {
// trying to read a source, so let's not kill everything with a panic/fatal
log.Error("Error reading config file ", err)
Expand Down Expand Up @@ -164,16 +163,14 @@ func getDomain(v1 *viper.Viper, mc *minio.Client, urls []string, sourceName stri
// even if no JSON-LD packages are found, record the event of checking this URL
if len(jsonlds) < 1 {
// TODO is it here where I then try headless, and scope the following for-loop into an else?
log.WithFields(log.Fields{"url": urlloc, "contentType": "Direct access failed, trying headless']"}).Info("Direct access failed, trying headless for ", urlloc)
repologger.WithFields(log.Fields{"url": urlloc, "contentType": "Direct access failed, trying headless']"}).Error() // this needs to go into the issues file
err := PageRenderAndUpload(v1, mc, 60*time.Second, urlloc, sourceName, repologger, repoStats) // TODO make delay configurable

if err != nil {
log.WithFields(log.Fields{"url": urlloc, "issue": "converting json ld"}).Error("PageRenderAndUpload ", urlloc, "::", err)
repologger.WithFields(log.Fields{"url": urlloc, "issue": "converting json ld"}).Error(err)
}
if err != nil {
log.Error("DB Update", urlloc, "::", err)
if headlessWait >= 0 {
log.WithFields(log.Fields{"url": urlloc, "contentType": "Direct access failed, trying headless']"}).Info("Direct access failed, trying headless for ", urlloc)
repologger.WithFields(log.Fields{"url": urlloc, "contentType": "Direct access failed, trying headless']"}).Error() // this needs to go into the issues file
err := PageRenderAndUpload(v1, mc, 60*time.Second, urlloc, sourceName, repologger, repoStats) // TODO make delay configurable
if err != nil {
log.WithFields(log.Fields{"url": urlloc, "issue": "converting json ld"}).Error("PageRenderAndUpload ", urlloc, "::", err)
repologger.WithFields(log.Fields{"url": urlloc, "issue": "converting json ld"}).Error(err)
}
}

} else {
Expand Down
2 changes: 1 addition & 1 deletion internal/summoner/acquire/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ func RetrieveAPIData(apiSources []configTypes.Sources, mc *minio.Client, runStat

func getAPISource(v1 *viper.Viper, mc *minio.Client, source configTypes.Sources, wg *sync.WaitGroup, repologger *log.Logger, repoStats *common.RepoStats) {

bucketName, tc, delay, err := getConfig(v1, source.Name)
bucketName, tc, delay, _, err := getConfig(v1, source.Name) // _ is headless wait
if err != nil {
// trying to read a source, so let's not kill everything with a panic/fatal
log.Error("Error reading config file ", err)
Expand Down
4 changes: 4 additions & 0 deletions internal/summoner/acquire/headlessNG.go
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,10 @@ func PageRender(v1 *viper.Viper, timeout time.Duration, url, k string, repologge
sources, err := configTypes.GetSources(v1)
source, err := configTypes.GetSourceByName(sources, k)
headlessWait := source.HeadlessWait
if headlessWait < 0 {
log.Info("Headless wait on a headless configured to less that zero. Setting to 0")
headlessWait = 0 // if someone screws up the config, be good
}

if timeout*time.Duration(retries) < time.Duration(headlessWait)*time.Second {
timeout = time.Duration(headlessWait) * time.Second
Expand Down