From 477b94f9ed4a052b901f44377e433ad2897f1d9d Mon Sep 17 00:00:00 2001 From: iamargus95 Date: Thu, 6 Apr 2023 11:51:39 +0530 Subject: [PATCH 1/7] replace net/url with urlutil Signed-off-by: iamargus95 --- cmd/tools/crawl-maze-score/main.go | 4 +- pkg/engine/common/base.go | 9 +- pkg/engine/hybrid/crawl.go | 9 +- pkg/engine/hybrid/hybrid.go | 4 +- pkg/engine/parser/files/robotstxt_test.go | 7 +- pkg/engine/parser/files/sitemapxml_test.go | 6 +- pkg/engine/parser/parser.go | 8 +- pkg/engine/parser/parser_test.go | 126 ++++++++++----------- pkg/output/fields.go | 11 +- pkg/types/crawler_options.go | 6 +- pkg/utils/extensions/extensions.go | 7 +- pkg/utils/scope/scope_test.go | 34 +++--- pkg/utils/utils.go | 4 +- 13 files changed, 121 insertions(+), 114 deletions(-) diff --git a/cmd/tools/crawl-maze-score/main.go b/cmd/tools/crawl-maze-score/main.go index 6bb4a709..f9d9385f 100644 --- a/cmd/tools/crawl-maze-score/main.go +++ b/cmd/tools/crawl-maze-score/main.go @@ -5,11 +5,11 @@ import ( "fmt" "log" "math" - "net/url" "os" "strings" "github.com/logrusorgru/aurora" + urlutil "github.com/projectdiscovery/utils/url" ) // expectedResults is the list of expected endpoints from security-crawl-maze @@ -171,7 +171,7 @@ func colorizeText(text string, value bool) string { } func strippedLink(link string) string { - parsed, _ := url.Parse(link) + parsed, _ := urlutil.Parse(link) return parsed.Path } diff --git a/pkg/engine/common/base.go b/pkg/engine/common/base.go index 588ff6fa..8d91e378 100644 --- a/pkg/engine/common/base.go +++ b/pkg/engine/common/base.go @@ -21,6 +21,7 @@ import ( "github.com/projectdiscovery/retryablehttp-go" errorutil "github.com/projectdiscovery/utils/errors" mapsutil "github.com/projectdiscovery/utils/maps" + urlutil "github.com/projectdiscovery/utils/url" "github.com/remeh/sizedwaitgroup" ) @@ -85,11 +86,11 @@ func (s *Shared) Enqueue(queue *queue.Queue, navigationRequests ...*navigation.R } func (s *Shared) ValidateScope(URL string, root string) bool { - parsedURL, err := url.Parse(URL) + parsed, err := urlutil.Parse(URL) if err != nil { return false } - scopeValidated, err := s.Options.ScopeManager.Validate(parsedURL, root) + scopeValidated, err := s.Options.ScopeManager.Validate(parsed.URL, root) return err == nil && scopeValidated } @@ -130,7 +131,7 @@ func (s *Shared) NewCrawlSessionWithURL(URL string) (*CrawlSession, error) { ctx, cancel = context.WithTimeout(ctx, time.Duration(s.Options.Options.CrawlDuration)*time.Second) } - parsed, err := url.Parse(URL) + parsed, err := urlutil.Parse(URL) if err != nil { //nolint return nil, errorutil.New("could not parse root URL").Wrap(err) @@ -173,7 +174,7 @@ func (s *Shared) NewCrawlSessionWithURL(URL string) (*CrawlSession, error) { crawlSession := &CrawlSession{ Ctx: ctx, CancelFunc: cancel, - URL: parsed, + URL: parsed.URL, Hostname: hostname, Queue: queue, HttpClient: httpclient, diff --git a/pkg/engine/hybrid/crawl.go b/pkg/engine/hybrid/crawl.go index cefb086f..bd0ce972 100644 --- a/pkg/engine/hybrid/crawl.go +++ b/pkg/engine/hybrid/crawl.go @@ -5,7 +5,7 @@ import ( "io" "net/http" "net/http/httputil" - "net/url" + "strings" "time" @@ -20,6 +20,7 @@ import ( errorutil "github.com/projectdiscovery/utils/errors" mapsutil "github.com/projectdiscovery/utils/maps" stringsutil "github.com/projectdiscovery/utils/strings" + urlutil "github.com/projectdiscovery/utils/url" ) func (c *Crawler) navigateRequest(s *common.CrawlSession, request *navigation.Request) (*navigation.Response, error) { @@ -41,7 +42,7 @@ func (c *Crawler) navigateRequest(s *common.CrawlSession, request *navigation.Re RequestStage: proto.FetchRequestStageResponse, }) go pageRouter.Start(func(e *proto.FetchRequestPaused) error { - URL, _ := url.Parse(e.Request.URL) + URL, _ := urlutil.Parse(e.Request.URL) body, _ := FetchGetResponseBody(page, e) headers := make(map[string][]string) for _, h := range e.ResponseHeaders { @@ -149,11 +150,11 @@ func (c *Crawler) navigateRequest(s *common.CrawlSession, request *navigation.Re return nil, errorutil.NewWithTag("hybrid", "could not get html").Wrap(err) } - parsed, err := url.Parse(request.URL) + parsed, err := urlutil.Parse(request.URL) if err != nil { return nil, errorutil.NewWithTag("hybrid", "url could not be parsed").Wrap(err) } - response.Resp.Request.URL = parsed + response.Resp.Request.URL = parsed.URL // Create a copy of intrapolated shadow DOM elements and parse them separately responseCopy := *response diff --git a/pkg/engine/hybrid/hybrid.go b/pkg/engine/hybrid/hybrid.go index 1d417656..eb130ed8 100644 --- a/pkg/engine/hybrid/hybrid.go +++ b/pkg/engine/hybrid/hybrid.go @@ -2,7 +2,6 @@ package hybrid import ( "fmt" - "net/url" "os" "github.com/go-rod/rod" @@ -12,6 +11,7 @@ import ( "github.com/projectdiscovery/katana/pkg/types" errorutil "github.com/projectdiscovery/utils/errors" stringsutil "github.com/projectdiscovery/utils/strings" + urlutil "github.com/projectdiscovery/utils/url" ps "github.com/shirou/gopsutil/v3/process" "go.uber.org/multierr" ) @@ -75,7 +75,7 @@ func New(options *types.CrawlerOptions) (*Crawler, error) { } if options.Options.Proxy != "" && options.Options.Headless { - proxyURL, err := url.Parse(options.Options.Proxy) + proxyURL, err := urlutil.Parse(options.Options.Proxy) if err != nil { return nil, err } diff --git a/pkg/engine/parser/files/robotstxt_test.go b/pkg/engine/parser/files/robotstxt_test.go index f791f5d0..f91a763d 100644 --- a/pkg/engine/parser/files/robotstxt_test.go +++ b/pkg/engine/parser/files/robotstxt_test.go @@ -2,10 +2,11 @@ package files import ( "net/http" - "net/url" + "strings" "testing" + urlutil "github.com/projectdiscovery/utils/url" "github.com/stretchr/testify/require" ) @@ -23,8 +24,8 @@ Disallow: /test/includes/ # Allow: /random/ Sitemap: https://example.com/sitemap.xml` - parsed, _ := url.Parse("http://localhost/robots.txt") - navigationRequests, err := crawler.parseReader(strings.NewReader(content), &http.Response{Request: &http.Request{URL: parsed}}) + parsed, _ := urlutil.Parse("http://localhost/robots.txt") + navigationRequests, err := crawler.parseReader(strings.NewReader(content), &http.Response{Request: &http.Request{URL: parsed.URL}}) require.Nil(t, err) for _, navReq := range navigationRequests { diff --git a/pkg/engine/parser/files/sitemapxml_test.go b/pkg/engine/parser/files/sitemapxml_test.go index 3929a9f5..3aa398ff 100644 --- a/pkg/engine/parser/files/sitemapxml_test.go +++ b/pkg/engine/parser/files/sitemapxml_test.go @@ -2,10 +2,10 @@ package files import ( "net/http" - "net/url" "strings" "testing" + urlutil "github.com/projectdiscovery/utils/url" "github.com/stretchr/testify/require" ) @@ -21,8 +21,8 @@ func TestSitemapXmlParseReader(t *testing.T) { 2019-06-19T12:00:00+00:00 ` - parsed, _ := url.Parse("http://security-crawl-maze.app/sitemap.xml") - navigationRequests, err := crawler.parseReader(strings.NewReader(content), &http.Response{Request: &http.Request{URL: parsed}}) + parsed, _ := urlutil.Parse("http://security-crawl-maze.app/sitemap.xml") + navigationRequests, err := crawler.parseReader(strings.NewReader(content), &http.Response{Request: &http.Request{URL: parsed.URL}}) require.Nil(t, err) for _, navReq := range navigationRequests { requests = append(requests, navReq.URL) diff --git a/pkg/engine/parser/parser.go b/pkg/engine/parser/parser.go index 82c24ada..03c4613f 100644 --- a/pkg/engine/parser/parser.go +++ b/pkg/engine/parser/parser.go @@ -10,6 +10,7 @@ import ( "github.com/projectdiscovery/katana/pkg/output" "github.com/projectdiscovery/katana/pkg/types" "github.com/projectdiscovery/katana/pkg/utils" + urlutil "github.com/projectdiscovery/utils/url" "golang.org/x/net/html" ) @@ -522,7 +523,7 @@ func bodyFormTagParser(resp *navigation.Response) (navigationRequests []*navigat return } - parsedURL, err := url.Parse(actionURL) + parsed, err := urlutil.Parse(actionURL) if err != nil { return } @@ -578,8 +579,9 @@ func bodyFormTagParser(resp *navigation.Response) (navigationRequests []*navigat } switch method { case "GET": - parsedURL.RawQuery = queryValuesWriter.Encode() - req.URL = parsedURL.String() + parsed.Update() + parsed.RawQuery = queryValuesWriter.Encode() + req.URL = parsed.URL.String() case "POST": if multipartWriter != nil { req.Body = sb.String() diff --git a/pkg/engine/parser/parser_test.go b/pkg/engine/parser/parser_test.go index 3aed5cb9..ac9a580f 100644 --- a/pkg/engine/parser/parser_test.go +++ b/pkg/engine/parser/parser_test.go @@ -2,7 +2,6 @@ package parser import ( "net/http" - "net/url" "regexp" "strings" "testing" @@ -10,57 +9,58 @@ import ( "github.com/PuerkitoBio/goquery" "github.com/projectdiscovery/katana/pkg/navigation" "github.com/projectdiscovery/katana/pkg/output" + urlutil "github.com/projectdiscovery/utils/url" "github.com/stretchr/testify/require" ) func TestHeaderParsers(t *testing.T) { - parsed, _ := url.Parse("https://security-crawl-maze.app/headers/xyz/") + parsed, _ := urlutil.Parse("https://security-crawl-maze.app/headers/xyz/") t.Run("content-location", func(t *testing.T) { - resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}, Header: http.Header{"Content-Location": []string{"/test/headers/content-location.found"}}}} + resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}, Header: http.Header{"Content-Location": []string{"/test/headers/content-location.found"}}}} navigationRequests := headerContentLocationParser(resp) require.Equal(t, "https://security-crawl-maze.app/test/headers/content-location.found", navigationRequests[0].URL, "could not get correct url") }) t.Run("link", func(t *testing.T) { - resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}, Header: http.Header{"Link": []string{"; rel=\"preload\""}}}} + resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}, Header: http.Header{"Link": []string{"; rel=\"preload\""}}}} navigationRequests := headerLinkParser(resp) require.Equal(t, "https://security-crawl-maze.app/test/headers/link.found", navigationRequests[0].URL, "could not get correct url") }) t.Run("location", func(t *testing.T) { - resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}, Header: http.Header{"Location": []string{"http://security-crawl-maze.app/test/headers/location.found"}}}} + resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}, Header: http.Header{"Location": []string{"http://security-crawl-maze.app/test/headers/location.found"}}}} navigationRequests := headerLocationParser(resp) require.Equal(t, "http://security-crawl-maze.app/test/headers/location.found", navigationRequests[0].URL, "could not get correct url") }) t.Run("refresh", func(t *testing.T) { - resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}, Header: http.Header{"Refresh": []string{"999; url=/test/headers/refresh.found"}}}} + resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}, Header: http.Header{"Refresh": []string{"999; url=/test/headers/refresh.found"}}}} navigationRequests := headerRefreshParser(resp) require.Equal(t, "https://security-crawl-maze.app/test/headers/refresh.found", navigationRequests[0].URL, "could not get correct url") }) } func TestBodyParsers(t *testing.T) { - parsed, _ := url.Parse("https://security-crawl-maze.app/html/body/xyz/") + parsed, _ := urlutil.Parse("https://security-crawl-maze.app/html/body/xyz/") t.Run("a", func(t *testing.T) { documentReader, _ := goquery.NewDocumentFromReader(strings.NewReader("")) - resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}}, Reader: documentReader} + resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}}, Reader: documentReader} navigationRequests := bodyATagParser(resp) require.Equal(t, "https://security-crawl-maze.app/test/html/body/a/href.found", navigationRequests[0].URL, "could not get correct url") documentReader, _ = goquery.NewDocumentFromReader(strings.NewReader("")) - resp = &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}}, Reader: documentReader} + resp = &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}}, Reader: documentReader} navigationRequests = bodyATagParser(resp) require.Equal(t, "https://security-crawl-maze.app/test/html/body/a/ping.found", navigationRequests[0].URL, "could not get correct url") }) t.Run("background", func(t *testing.T) { documentReader, _ := goquery.NewDocumentFromReader(strings.NewReader("")) - resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}}, Reader: documentReader} + resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}}, Reader: documentReader} navigationRequests := bodyBackgroundTagParser(resp) require.Equal(t, "https://security-crawl-maze.app/test/html/body/background.found", navigationRequests[0].URL, "could not get correct url") }) t.Run("blockquote", func(t *testing.T) { documentReader, _ := goquery.NewDocumentFromReader(strings.NewReader(`
`)) - resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}}, Reader: documentReader} + resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}}, Reader: documentReader} navigationRequests := bodyBlockquoteCiteTagParser(resp) require.Equal(t, "https://security-crawl-maze.app/test/html/body/blockquote/cite.found", navigationRequests[0].URL, "could not get correct url") }) @@ -68,21 +68,21 @@ func TestBodyParsers(t *testing.T) { documentReader, _ := goquery.NewDocumentFromReader(strings.NewReader(` `)) - resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}}, Reader: documentReader} + resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}}, Reader: documentReader} navigationRequests := bodyMapAreaPingTagParser(resp) require.Equal(t, "https://security-crawl-maze.app/test/html/body/map/area/ping.found", navigationRequests[0].URL, "could not get correct url") }) t.Run("audio", func(t *testing.T) { t.Run("src", func(t *testing.T) { documentReader, _ := goquery.NewDocumentFromReader(strings.NewReader("")) - resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}}, Reader: documentReader} + resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}}, Reader: documentReader} navigationRequests := bodyAudioTagParser(resp) require.Equal(t, "https://security-crawl-maze.app/test/html/body/audio/src.found", navigationRequests[0].URL, "could not get correct url") }) t.Run("source", func(t *testing.T) { t.Run("src", func(t *testing.T) { documentReader, _ := goquery.NewDocumentFromReader(strings.NewReader("")) - resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}}, Reader: documentReader} + resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}}, Reader: documentReader} navigationRequests := bodyAudioTagParser(resp) require.Equal(t, "https://security-crawl-maze.app/test/html/body/audio/source/src.found", navigationRequests[0].URL, "could not get correct url") }) @@ -92,7 +92,7 @@ func TestBodyParsers(t *testing.T) { `)) - resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}}, Reader: documentReader} + resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}}, Reader: documentReader} for _, navigationRequest := range bodyAudioTagParser(resp) { gotURL = append(gotURL, navigationRequest.URL) } @@ -106,25 +106,25 @@ func TestBodyParsers(t *testing.T) { t.Run("img", func(t *testing.T) { t.Run("dynsrc", func(t *testing.T) { documentReader, _ := goquery.NewDocumentFromReader(strings.NewReader(``)) - resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}}, Reader: documentReader} + resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}}, Reader: documentReader} navigationRequests := bodyImgTagParser(resp) require.Equal(t, "https://security-crawl-maze.app/test/html/body/img/dynsrc.found", navigationRequests[0].URL, "could not get correct url") }) t.Run("longdesc", func(t *testing.T) { documentReader, _ := goquery.NewDocumentFromReader(strings.NewReader(``)) - resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}}, Reader: documentReader} + resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}}, Reader: documentReader} navigationRequests := bodyImgTagParser(resp) require.Equal(t, "https://security-crawl-maze.app/test/html/body/img/longdesc.found", navigationRequests[0].URL, "could not get correct url") }) t.Run("lowsrc", func(t *testing.T) { documentReader, _ := goquery.NewDocumentFromReader(strings.NewReader(``)) - resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}}, Reader: documentReader} + resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}}, Reader: documentReader} navigationRequests := bodyImgTagParser(resp) require.Equal(t, "https://security-crawl-maze.app/test/html/body/img/lowsrc.found", navigationRequests[0].URL, "could not get correct url") }) t.Run("src", func(t *testing.T) { documentReader, _ := goquery.NewDocumentFromReader(strings.NewReader(``)) - resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}}, Reader: documentReader} + resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}}, Reader: documentReader} navigationRequests := bodyImgTagParser(resp) require.Equal(t, "https://security-crawl-maze.app/test/html/body/img/src.found", navigationRequests[0].URL, "could not get correct url") }) @@ -132,7 +132,7 @@ func TestBodyParsers(t *testing.T) { var gotURL []string documentReader, _ := goquery.NewDocumentFromReader(strings.NewReader(``)) - resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}}, Reader: documentReader} + resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}}, Reader: documentReader} for _, navigationResponse := range bodyImgTagParser(resp) { gotURL = append(gotURL, navigationResponse.URL) } @@ -147,7 +147,7 @@ func TestBodyParsers(t *testing.T) { // // parsed, _ = url.Parse("https://security-crawl-maze.app/html/body/frameset/frame/src.html") // var gotURL []string - // resp := navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}}, Body: []byte(`

+ // resp := navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}}, Body: []byte(`

// The test contains an inline string with known extension - /string-known-extension.pdf // The test contains an inline string - ./test/html/misc/string/dot-slash-prefix.found // The test contains an inline string - ../test/html/misc/string/dot-dot-slash-prefix.found @@ -165,19 +165,19 @@ func TestBodyParsers(t *testing.T) { }) t.Run("object", func(t *testing.T) { documentReader, _ := goquery.NewDocumentFromReader(strings.NewReader(``)) - resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}}, Reader: documentReader} + resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}}, Reader: documentReader} navigationRequests := bodyObjectTagParser(resp) require.Equal(t, "https://security-crawl-maze.app/test/html/body/object/data.found", navigationRequests[0].URL, "could not get correct url") documentReader, _ = goquery.NewDocumentFromReader(strings.NewReader(``)) - resp = &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}}, Reader: documentReader} + resp = &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}}, Reader: documentReader} navigationRequests = bodyObjectTagParser(resp) require.Equal(t, "https://security-crawl-maze.app/test/html/body/object/codebase.found", navigationRequests[0].URL, "could not get correct url") documentReader, _ = goquery.NewDocumentFromReader(strings.NewReader(` `)) - resp = &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}}, Reader: documentReader} + resp = &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}}, Reader: documentReader} navigationRequests = bodyObjectTagParser(resp) require.Equal(t, "https://security-crawl-maze.app/test/html/body/object/param/value.found", navigationRequests[0].URL, "could not get correct url") }) @@ -185,20 +185,20 @@ func TestBodyParsers(t *testing.T) { documentReader, _ := goquery.NewDocumentFromReader(strings.NewReader(` `)) - resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}}, Reader: documentReader} + resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}}, Reader: documentReader} navigationRequests := bodySvgTagParser(resp) require.Equal(t, "https://security-crawl-maze.app/test/html/body/svg/image/xlink.found", navigationRequests[0].URL, "could not get correct url") documentReader, _ = goquery.NewDocumentFromReader(strings.NewReader(` `)) - resp = &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}}, Reader: documentReader} + resp = &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}}, Reader: documentReader} navigationRequests = bodySvgTagParser(resp) require.Equal(t, "https://security-crawl-maze.app/test/html/body/svg/script/xlink.found", navigationRequests[0].URL, "could not get correct url") }) t.Run("table", func(t *testing.T) { documentReader, _ := goquery.NewDocumentFromReader(strings.NewReader(`
`)) - resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}}, Reader: documentReader} + resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}}, Reader: documentReader} navigationRequests := bodyTableTagParser(resp) require.Equal(t, "https://security-crawl-maze.app/test/html/body/table/background.found", navigationRequests[0].URL, "could not get correct url") @@ -207,80 +207,80 @@ func TestBodyParsers(t *testing.T) { `)) - resp = &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}}, Reader: documentReader} + resp = &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}}, Reader: documentReader} navigationRequests = bodyTableTagParser(resp) require.Equal(t, "https://security-crawl-maze.app/test/html/body/table/td/background.found", navigationRequests[0].URL, "could not get correct url") }) t.Run("video", func(t *testing.T) { documentReader, _ := goquery.NewDocumentFromReader(strings.NewReader(``)) - resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}}, Reader: documentReader} + resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}}, Reader: documentReader} navigationRequests := bodyVideoTagParser(resp) require.Equal(t, "https://security-crawl-maze.app/test/html/body/video/poster.found", navigationRequests[0].URL, "could not get correct url") documentReader, _ = goquery.NewDocumentFromReader(strings.NewReader(``)) - resp = &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}}, Reader: documentReader} + resp = &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}}, Reader: documentReader} navigationRequests = bodyVideoTagParser(resp) require.Equal(t, "https://security-crawl-maze.app/test/html/body/video/src.found", navigationRequests[0].URL, "could not get correct url") documentReader, _ = goquery.NewDocumentFromReader(strings.NewReader(``)) - resp = &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}}, Reader: documentReader} + resp = &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}}, Reader: documentReader} navigationRequests = bodyVideoTagParser(resp) require.Equal(t, "https://security-crawl-maze.app/test/html/body/video/track/src.found", navigationRequests[0].URL, "could not get correct url") }) t.Run("applet", func(t *testing.T) { documentReader, _ := goquery.NewDocumentFromReader(strings.NewReader(``)) - resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}}, Reader: documentReader} + resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}}, Reader: documentReader} navigationRequests := bodyAppletTagParser(resp) require.Equal(t, "https://security-crawl-maze.app/test/html/body/applet/archive.found", navigationRequests[0].URL, "could not get correct url") documentReader, _ = goquery.NewDocumentFromReader(strings.NewReader(``)) - resp = &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}}, Reader: documentReader} + resp = &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}}, Reader: documentReader} navigationRequests = bodyAppletTagParser(resp) require.Equal(t, "https://security-crawl-maze.app/test/html/body/applet/codebase.found", navigationRequests[0].URL, "could not get correct url") }) t.Run("link", func(t *testing.T) { documentReader, _ := goquery.NewDocumentFromReader(strings.NewReader("")) - resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}}, Reader: documentReader} + resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}}, Reader: documentReader} navigationRequests := bodyLinkHrefTagParser(resp) require.Equal(t, "https://security-crawl-maze.app/css/font-face.css", navigationRequests[0].URL, "could not get correct url") documentReader, _ = goquery.NewDocumentFromReader(strings.NewReader(``)) - resp = &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}}, Reader: documentReader} + resp = &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}}, Reader: documentReader} navigationRequests = bodyLinkHrefTagParser(resp) require.Equal(t, "https://security-crawl-maze.app/test/html/head/link/href.found", navigationRequests[0].URL, "could not get correct url") }) t.Run("base", func(t *testing.T) { documentReader, _ := goquery.NewDocumentFromReader(strings.NewReader(``)) - resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}}, Reader: documentReader} + resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}}, Reader: documentReader} navigationRequests := bodyBaseHrefTagParser(resp) require.Equal(t, "https://security-crawl-maze.app/test/html/head/base/href.found", navigationRequests[0].URL, "could not get correct url") }) t.Run("manifest", func(t *testing.T) { documentReader, _ := goquery.NewDocumentFromReader(strings.NewReader(``)) - resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}}, Reader: documentReader} + resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}}, Reader: documentReader} navigationRequests := bodyHtmlManifestTagParser(resp) require.Equal(t, "https://security-crawl-maze.app/test/html/manifest.found", navigationRequests[0].URL, "could not get correct url") }) t.Run("doctype", func(t *testing.T) { documentReader, _ := goquery.NewDocumentFromReader(strings.NewReader(` `)) - resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}}, Reader: documentReader} + resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}}, Reader: documentReader} navigationRequests := bodyHtmlDoctypeTagParser(resp) require.Equal(t, "https://security-crawl-maze.app/test/html/doctype.found", navigationRequests[0].URL, "could not get correct url") }) t.Run("import", func(t *testing.T) { documentReader, _ := goquery.NewDocumentFromReader(strings.NewReader(``)) - resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}}, Reader: documentReader} + resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}}, Reader: documentReader} navigationRequests := bodyImportImplementationTagParser(resp) require.Equal(t, "https://security-crawl-maze.app/test/html/head/import/implementation.found", navigationRequests[0].URL, "could not get correct url") }) t.Run("embed", func(t *testing.T) { documentReader, _ := goquery.NewDocumentFromReader(strings.NewReader("")) - resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}}, Reader: documentReader} + resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}}, Reader: documentReader} navigationRequests := bodyEmbedTagParser(resp) require.Equal(t, "https://security-crawl-maze.app/test/html/body/embed/src.found", navigationRequests[0].URL, "could not get correct url") }) @@ -295,7 +295,7 @@ func TestBodyParsers(t *testing.T) { // // // `)) - // resp := navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}}, Reader: documentReader} + // resp := navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}}, Reader: documentReader} // bodyFrameTagParser(resp, func(resp navigation.Request) { // gotURL = resp.URL // }) @@ -304,14 +304,14 @@ func TestBodyParsers(t *testing.T) { t.Run("iframe", func(t *testing.T) { t.Run("src", func(t *testing.T) { documentReader, _ := goquery.NewDocumentFromReader(strings.NewReader("")) - resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}}, Reader: documentReader} + resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}}, Reader: documentReader} navigationRequests := bodyIframeTagParser(resp) require.Equal(t, "https://security-crawl-maze.app/test/html/body/iframe/src.found", navigationRequests[0].URL, "could not get correct url") }) t.Run("srcdoc", func(t *testing.T) { //var gotURL string //documentReader, _ := goquery.NewDocumentFromReader(strings.NewReader("")) - //resp := navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}}, Reader: documentReader} + //resp := navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}}, Reader: documentReader} //bodyIframeTagParser(resp, func(resp navigation.Request) { // gotURL = resp.URL //}) @@ -320,38 +320,38 @@ func TestBodyParsers(t *testing.T) { }) t.Run("input", func(t *testing.T) { documentReader, _ := goquery.NewDocumentFromReader(strings.NewReader("")) - resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}}, Reader: documentReader} + resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}}, Reader: documentReader} navigationRequests := bodyInputSrcTagParser(resp) require.Equal(t, "https://security-crawl-maze.app/test/html/body/input/src.found", navigationRequests[0].URL, "could not get correct url") }) t.Run("isindex", func(t *testing.T) { documentReader, _ := goquery.NewDocumentFromReader(strings.NewReader("")) - resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}}, Reader: documentReader} + resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}}, Reader: documentReader} navigationRequests := bodyIsindexActionTagParser(resp) require.Equal(t, "https://security-crawl-maze.app/test/html/body/isindex/action.found", navigationRequests[0].URL, "could not get correct url") }) t.Run("script", func(t *testing.T) { documentReader, _ := goquery.NewDocumentFromReader(strings.NewReader("")) - resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}}, Reader: documentReader} + resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}}, Reader: documentReader} navigationRequests := bodyScriptSrcTagParser(resp) require.Equal(t, "https://security-crawl-maze.app/test/html/body/script/src.found", navigationRequests[0].URL, "could not get correct url") }) t.Run("button", func(t *testing.T) { documentReader, _ := goquery.NewDocumentFromReader(strings.NewReader("

")) - resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}}, Reader: documentReader} + resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}}, Reader: documentReader} navigationRequests := bodyButtonFormactionTagParser(resp) require.Equal(t, "https://security-crawl-maze.app/test/html/body/form/button/formaction.found", navigationRequests[0].URL, "could not get correct url") }) t.Run("form", func(t *testing.T) { t.Run("get", func(t *testing.T) { documentReader, _ := goquery.NewDocumentFromReader(strings.NewReader("
")) - resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}}, Reader: documentReader} + resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}}, Reader: documentReader} navigationRequests := bodyFormTagParser(resp) require.Equal(t, "https://security-crawl-maze.app/test/html/body/form/action-get.found?test1=test&test2=test", navigationRequests[0].URL, "could not get correct url") }) t.Run("post", func(t *testing.T) { documentReader, _ := goquery.NewDocumentFromReader(strings.NewReader("
")) - resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}}, Reader: documentReader} + resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}}, Reader: documentReader} navigationRequests := bodyFormTagParser(resp) require.Equal(t, "https://security-crawl-maze.app/test/html/body/form/action-post.found", navigationRequests[0].URL, "could not get correct url") require.Equal(t, "POST", navigationRequests[0].Method, "could not get correct method") @@ -361,28 +361,28 @@ func TestBodyParsers(t *testing.T) { t.Run("meta", func(t *testing.T) { // var gotURL string // documentReader, _ := goquery.NewDocumentFromReader(strings.NewReader("")) - // resp := navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}}, Reader: documentReader} + // resp := navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}}, Reader: documentReader} // bodyMetaContentTagParser(resp, func(resp navigation.Request) { // gotURL = resp.URL // }) // require.Equal(t, "https://security-crawl-maze.app/test/html/head/meta/content-redirect.found", gotURL, "could not get correct url") // // documentReader, _ = goquery.NewDocumentFromReader(strings.NewReader(``)) - // resp = navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}}, Reader: documentReader} + // resp = navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}}, Reader: documentReader} // bodyMetaContentTagParser(resp, func(resp navigation.Request) { // gotURL = resp.URL // }) // require.Equal(t, "https://security-crawl-maze.app/test/html/head/meta/content-csp.found", gotURL, "could not get correct url") // // documentReader, _ = goquery.NewDocumentFromReader(strings.NewReader(``)) - // resp = navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}}, Reader: documentReader} + // resp = navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}}, Reader: documentReader} // bodyMetaContentTagParser(resp, func(resp navigation.Request) { // gotURL = resp.URL // }) // require.Equal(t, "https://security-crawl-maze.app/test/html/head/meta/content-pinned-websites.found", gotURL, "could not get correct url") // // documentReader, _ = goquery.NewDocumentFromReader(strings.NewReader(``)) - // resp = navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}}, Reader: documentReader} + // resp = navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}}, Reader: documentReader} // bodyMetaContentTagParser(resp, func(resp navigation.Request) { // gotURL = resp.URL // }) @@ -391,23 +391,23 @@ func TestBodyParsers(t *testing.T) { } func TestScriptParsers(t *testing.T) { - parsed, _ := url.Parse("https://security-crawl-maze.app/html/script/xyz/") + parsed, _ := urlutil.Parse("https://security-crawl-maze.app/html/script/xyz/") t.Run("content", func(t *testing.T) { documentReader, _ := goquery.NewDocumentFromReader(strings.NewReader("")) - resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}}, Reader: documentReader} + resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}}, Reader: documentReader} navigationRequests := scriptContentRegexParser(resp) require.Equal(t, "https://security-crawl-maze.app/test/html/script/content.do", navigationRequests[0].URL, "could not get correct url") }) t.Run("js", func(t *testing.T) { - parsed, _ = url.Parse("https://security-crawl-maze.app/html/script/xyz/data.js") - resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}}, Body: "var endpoint='/test/html/script/body.do';"} + parsed, _ = urlutil.Parse("https://security-crawl-maze.app/html/script/xyz/data.js") + resp := &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}}, Body: "var endpoint='/test/html/script/body.do';"} navigationRequests := scriptJSFileRegexParser(resp) require.Equal(t, "https://security-crawl-maze.app/test/html/script/body.do", navigationRequests[0].URL, "could not get correct url") - parsed, _ = url.Parse("https://security-crawl-maze.app/html/script/xyz/") - resp = &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed}, Header: http.Header{"Content-Type": []string{"application/javascript"}}}, Body: "var endpoint='/test/html/script/body-content-type.do';"} + parsed, _ = urlutil.Parse("https://security-crawl-maze.app/html/script/xyz/") + resp = &navigation.Response{Resp: &http.Response{Request: &http.Request{URL: parsed.URL}, Header: http.Header{"Content-Type": []string{"application/javascript"}}}, Body: "var endpoint='/test/html/script/body-content-type.do';"} navigationRequests = scriptJSFileRegexParser(resp) require.Equal(t, "https://security-crawl-maze.app/test/html/script/body-content-type.do", navigationRequests[0].URL, "could not get correct url") @@ -415,11 +415,11 @@ func TestScriptParsers(t *testing.T) { } func TestRegexBodyParsers(t *testing.T) { - parsed, _ := url.Parse("https://security-crawl-maze.app/contact") + parsed, _ := urlutil.Parse("https://security-crawl-maze.app/contact") t.Run("regexbody", func(t *testing.T) { output.CustomFieldsMap = make(map[string]output.CustomFieldConfig) resp := &navigation.Response{ - Resp: &http.Response{Request: &http.Request{URL: parsed}}, + Resp: &http.Response{Request: &http.Request{URL: parsed.URL}}, Depth: 0, Body: "some content contact@example.com", } @@ -439,7 +439,7 @@ func TestRegexBodyParsers(t *testing.T) { t.Run("regexheader", func(t *testing.T) { output.CustomFieldsMap = make(map[string]output.CustomFieldConfig) resp := &navigation.Response{ - Resp: &http.Response{Request: &http.Request{URL: parsed}, + Resp: &http.Response{Request: &http.Request{URL: parsed.URL}, Header: http.Header{ "server": []string{"ECS (dcb/7F84)"}, }, @@ -462,7 +462,7 @@ func TestRegexBodyParsers(t *testing.T) { t.Run("regexresponse", func(t *testing.T) { output.CustomFieldsMap = make(map[string]output.CustomFieldConfig) resp := &navigation.Response{ - Resp: &http.Response{Request: &http.Request{URL: parsed}, + Resp: &http.Response{Request: &http.Request{URL: parsed.URL}, Header: http.Header{ "server": []string{"ECS (dcb/7F84)"}, }, diff --git a/pkg/output/fields.go b/pkg/output/fields.go index 824625aa..0fc81fd5 100644 --- a/pkg/output/fields.go +++ b/pkg/output/fields.go @@ -9,6 +9,7 @@ import ( errorutil "github.com/projectdiscovery/utils/errors" stringsutil "github.com/projectdiscovery/utils/strings" + urlutil "github.com/projectdiscovery/utils/url" "golang.org/x/net/publicsuffix" ) @@ -59,7 +60,7 @@ func validateFieldNames(names string) error { // storeFields stores fields for a result into individual files // based on name. func storeFields(output *Result, storeFields []string) { - parsed, err := url.Parse(output.Request.URL) + parsed, err := urlutil.Parse(output.Request.URL) if err != nil { return } @@ -68,13 +69,13 @@ func storeFields(output *Result, storeFields []string) { etld, _ := publicsuffix.EffectiveTLDPlusOne(hostname) rootURL := fmt.Sprintf("%s://%s", parsed.Scheme, parsed.Host) for _, field := range storeFields { - if result := getValueForField(output, parsed, hostname, etld, rootURL, field); result != "" { - appendToFileField(parsed, field, result) + if result := getValueForField(output, parsed.URL, hostname, etld, rootURL, field); result != "" { + appendToFileField(parsed.URL, field, result) } if _, ok := CustomFieldsMap[field]; ok { results := getValueForCustomField(output) for _, result := range results { - appendToFileField(parsed, result.field, result.value) + appendToFileField(parsed.URL, result.field, result.value) } } } @@ -94,7 +95,7 @@ func appendToFileField(parsed *url.URL, field, data string) { // formatField formats output results based on fields from fieldNames func formatField(output *Result, fields string) []fieldOutput { var svalue []fieldOutput - parsed, _ := url.Parse(output.Request.URL) + parsed, _ := urlutil.Parse(output.Request.URL) if parsed == nil { return svalue } diff --git a/pkg/types/crawler_options.go b/pkg/types/crawler_options.go index 9ff7e200..90a0fca5 100644 --- a/pkg/types/crawler_options.go +++ b/pkg/types/crawler_options.go @@ -2,7 +2,6 @@ package types import ( "context" - "net/url" "time" "github.com/projectdiscovery/fastdialer/fastdialer" @@ -12,6 +11,7 @@ import ( "github.com/projectdiscovery/katana/pkg/utils/scope" "github.com/projectdiscovery/ratelimit" errorutil "github.com/projectdiscovery/utils/errors" + urlutil "github.com/projectdiscovery/utils/url" wappalyzer "github.com/projectdiscovery/wappalyzergo" ) @@ -115,12 +115,12 @@ func (c *CrawlerOptions) ValidatePath(path string) bool { // ValidateScope validates scope for an AbsURL func (c *CrawlerOptions) ValidateScope(absURL, rootHostname string) (bool, error) { - parsed, err := url.Parse(absURL) + parsed, err := urlutil.Parse(absURL) if err != nil { return false, err } if c.ScopeManager != nil { - return c.ScopeManager.Validate(parsed, rootHostname) + return c.ScopeManager.Validate(parsed.URL, rootHostname) } return true, nil } diff --git a/pkg/utils/extensions/extensions.go b/pkg/utils/extensions/extensions.go index e0efb760..148ac9e1 100644 --- a/pkg/utils/extensions/extensions.go +++ b/pkg/utils/extensions/extensions.go @@ -1,9 +1,10 @@ package extensions import ( - "net/url" "path" "strings" + + urlutil "github.com/projectdiscovery/utils/url" ) // defaultDenylist is the default list of extensions to be denied @@ -37,8 +38,8 @@ func NewValidator(extensionsMatch, extensionsFilter []string) *Validator { // ValidatePath returns true if an extension is allowed by the validator func (e *Validator) ValidatePath(item string) bool { var extension string - u, _ := url.Parse(item) - if u != nil { + u, _ := urlutil.Parse(item) + if u.Path != "" { extension = strings.ToLower(path.Ext(u.Path)) } else { extension = strings.ToLower(path.Ext(item)) diff --git a/pkg/utils/scope/scope_test.go b/pkg/utils/scope/scope_test.go index 2a53a8c3..6f8e794a 100644 --- a/pkg/utils/scope/scope_test.go +++ b/pkg/utils/scope/scope_test.go @@ -1,9 +1,9 @@ package scope import ( - "net/url" "testing" + urlutil "github.com/projectdiscovery/utils/url" "github.com/stretchr/testify/require" ) @@ -12,13 +12,13 @@ func TestManagerValidate(t *testing.T) { manager, err := NewManager([]string{`example`}, []string{`logout\.php`}, "dn", false) require.NoError(t, err, "could not create scope manager") - parsed, _ := url.Parse("https://test.com/index.php/example") - validated, err := manager.Validate(parsed, "test.com") + parsed, _ := urlutil.Parse("https://test.com/index.php/example") + validated, err := manager.Validate(parsed.URL, "test.com") require.NoError(t, err, "could not validate url") require.True(t, validated, "could not get correct in-scope validation") - parsed, _ = url.Parse("https://test.com/logout.php") - validated, err = manager.Validate(parsed, "another.com") + parsed, _ = urlutil.Parse("https://test.com/logout.php") + validated, err = manager.Validate(parsed.URL, "another.com") require.NoError(t, err, "could not validate url") require.False(t, validated, "could not get correct out-scope validation") }) @@ -27,8 +27,8 @@ func TestManagerValidate(t *testing.T) { manager, err := NewManager(nil, nil, "dn", false) require.NoError(t, err, "could not create scope manager") - parsed, _ := url.Parse("https://testanother.com/index.php") - validated, err := manager.Validate(parsed, "test.com") + parsed, _ := urlutil.Parse("https://testanother.com/index.php") + validated, err := manager.Validate(parsed.URL, "test.com") require.NoError(t, err, "could not validate host") require.True(t, validated, "could not get correct in-scope validation") }) @@ -36,8 +36,8 @@ func TestManagerValidate(t *testing.T) { manager, err := NewManager(nil, nil, "rdn", false) require.NoError(t, err, "could not create scope manager") - parsed, _ := url.Parse("https://subdomain.example.com/logout.php") - validated, err := manager.Validate(parsed, "example.com") + parsed, _ := urlutil.Parse("https://subdomain.example.com/logout.php") + validated, err := manager.Validate(parsed.URL, "example.com") require.NoError(t, err, "could not validate host") require.True(t, validated, "could not get correct in-scope validation") }) @@ -45,8 +45,8 @@ func TestManagerValidate(t *testing.T) { manager, err := NewManager(nil, nil, "rdn", false) require.NoError(t, err, "could not create scope manager") - parsed, _ := url.Parse("http://localhost:8082/logout.php") - validated, err := manager.Validate(parsed, "localhost") + parsed, _ := urlutil.Parse("http://localhost:8082/logout.php") + validated, err := manager.Validate(parsed.URL, "localhost") require.NoError(t, err, "could not validate host") require.True(t, validated, "could not get correct in-scope validation") }) @@ -54,18 +54,18 @@ func TestManagerValidate(t *testing.T) { manager, err := NewManager(nil, nil, "fqdn", false) require.NoError(t, err, "could not create scope manager") - parsed, _ := url.Parse("https://test.com/index.php") - validated, err := manager.Validate(parsed, "test.com") + parsed, _ := urlutil.Parse("https://test.com/index.php") + validated, err := manager.Validate(parsed.URL, "test.com") require.NoError(t, err, "could not validate host") require.True(t, validated, "could not get correct in-scope validation") - parsed, _ = url.Parse("https://subdomain.example.com/logout.php") - validated, err = manager.Validate(parsed, "example.com") + parsed, _ = urlutil.Parse("https://subdomain.example.com/logout.php") + validated, err = manager.Validate(parsed.URL, "example.com") require.NoError(t, err, "could not validate host") require.False(t, validated, "could not get correct out-scope validation") - parsed, _ = url.Parse("https://example.com/logout.php") - validated, err = manager.Validate(parsed, "another.com") + parsed, _ = urlutil.Parse("https://example.com/logout.php") + validated, err = manager.Validate(parsed.URL, "another.com") require.NoError(t, err, "could not validate host") require.False(t, validated, "could not get correct out-scope validation") }) diff --git a/pkg/utils/utils.go b/pkg/utils/utils.go index ec8b3a89..170ab9b2 100644 --- a/pkg/utils/utils.go +++ b/pkg/utils/utils.go @@ -1,11 +1,11 @@ package utils import ( - "net/url" "strings" "github.com/lukasbob/srcset" stringsutil "github.com/projectdiscovery/utils/strings" + urlutil "github.com/projectdiscovery/utils/url" ) // IsURL returns true if a provided string is URL @@ -73,7 +73,7 @@ func FlattenHeaders(headers map[string][]string) map[string]string { // ReplaceAllQueryParam replaces all the query param with the given value func ReplaceAllQueryParam(reqUrl, val string) string { - u, err := url.Parse(reqUrl) + u, err := urlutil.Parse(reqUrl) if err != nil { return reqUrl } From 56f1c837f0fb5c9950298f5dd0ce8276b0593889 Mon Sep 17 00:00:00 2001 From: Tarun Koyalwar Date: Fri, 7 Apr 2023 20:36:42 +0530 Subject: [PATCH 2/7] improved logging --- README.md | 4 +--- cmd/katana/main.go | 3 ++- cmd/tools/crawl-maze-score/main.go | 6 +++++- go.mod | 2 +- go.sum | 4 ++-- internal/runner/executer.go | 23 +++++++++++++++++++++- internal/runner/options.go | 12 +++++------ pkg/engine/common/base.go | 6 +++++- pkg/engine/hybrid/hybrid.go | 2 ++ pkg/engine/parser/files/robotstxt_test.go | 3 ++- pkg/engine/parser/files/sitemapxml_test.go | 3 ++- pkg/engine/parser/parser.go | 8 ++++---- pkg/engine/standard/standard.go | 3 ++- pkg/output/fields.go | 2 ++ pkg/types/options.go | 2 ++ pkg/utils/extensions/extensions.go | 6 +++++- pkg/utils/utils.go | 9 +++++++-- 17 files changed, 72 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 78d722ab..6afd7e59 100644 --- a/README.md +++ b/README.md @@ -109,9 +109,6 @@ This will display help for the tool. Here are all the switches it supports. Usage: ./katana [flags] -Katana is a fast crawler focused on execution in automation -pipelines offering both headless and non-headless crawling. - Flags: INPUT: -u, -list string[] target url / list to crawl @@ -182,6 +179,7 @@ OUTPUT: -nc, -no-color disable output content coloring (ANSI escape codes) -silent display output only -v, -verbose display verbose output + -debug display debug output -version display project version ``` diff --git a/cmd/katana/main.go b/cmd/katana/main.go index e5fa8180..2f34261f 100644 --- a/cmd/katana/main.go +++ b/cmd/katana/main.go @@ -140,6 +140,7 @@ pipelines offering both headless and non-headless crawling.`) flagSet.BoolVarP(&options.NoColors, "no-color", "nc", false, "disable output content coloring (ANSI escape codes)"), flagSet.BoolVar(&options.Silent, "silent", false, "display output only"), flagSet.BoolVarP(&options.Verbose, "verbose", "v", false, "display verbose output"), + flagSet.BoolVar(&options.Debug, "debug", false, "display debug output"), flagSet.BoolVar(&options.Version, "version", false, "display project version"), ) @@ -157,7 +158,7 @@ pipelines offering both headless and non-headless crawling.`) func init() { // show detailed stacktrace in debug mode - if os.Getenv("DEBUG") != "" { + if os.Getenv("DEBUG") == "true" { errorutil.ShowStackTrace = true } } diff --git a/cmd/tools/crawl-maze-score/main.go b/cmd/tools/crawl-maze-score/main.go index f9d9385f..dcc59c75 100644 --- a/cmd/tools/crawl-maze-score/main.go +++ b/cmd/tools/crawl-maze-score/main.go @@ -9,6 +9,7 @@ import ( "strings" "github.com/logrusorgru/aurora" + "github.com/projectdiscovery/gologger" urlutil "github.com/projectdiscovery/utils/url" ) @@ -171,7 +172,10 @@ func colorizeText(text string, value bool) string { } func strippedLink(link string) string { - parsed, _ := urlutil.Parse(link) + parsed, err := urlutil.Parse(link) + if err != nil { + gologger.Warning().Msgf("failed to parse link while extracting path: %v", err) + } return parsed.Path } diff --git a/go.mod b/go.mod index cde6ab0e..a6f2ed92 100644 --- a/go.mod +++ b/go.mod @@ -14,7 +14,7 @@ require ( github.com/projectdiscovery/hmap v0.0.11 github.com/projectdiscovery/ratelimit v0.0.6 github.com/projectdiscovery/retryablehttp-go v1.0.14 - github.com/projectdiscovery/utils v0.0.17 + github.com/projectdiscovery/utils v0.0.19 github.com/projectdiscovery/wappalyzergo v0.0.87 github.com/remeh/sizedwaitgroup v1.0.0 github.com/rs/xid v1.4.0 diff --git a/go.sum b/go.sum index 5ac6e67e..60c1f1d2 100644 --- a/go.sum +++ b/go.sum @@ -168,8 +168,8 @@ github.com/projectdiscovery/retryablehttp-go v1.0.14 h1:PJaHQtHWE00xrmryZwhtma2b github.com/projectdiscovery/retryablehttp-go v1.0.14/go.mod h1:L5HwtGSvc0E3dNVtVqPACWOmr21Bbop2ZhpbCPYEeYU= github.com/projectdiscovery/stringsutil v0.0.2 h1:uzmw3IVLJSMW1kEg8eCStG/cGbYYZAja8BH3LqqJXMA= github.com/projectdiscovery/stringsutil v0.0.2/go.mod h1:EJ3w6bC5fBYjVou6ryzodQq37D5c6qbAYQpGmAy+DC0= -github.com/projectdiscovery/utils v0.0.17 h1:Y/uj8wAI1/a9UtWwDTBCBgsc2RicLUhrhcbCCsC7nrM= -github.com/projectdiscovery/utils v0.0.17/go.mod h1:Cu216AlQ7rAYa8aDBqB2OgNfu5p24Uj+tG9RxV8Wbfs= +github.com/projectdiscovery/utils v0.0.19 h1:5m70xVhBq86h5KsOMVWSO0H+PTmkwlOGAekWcHsaTxI= +github.com/projectdiscovery/utils v0.0.19/go.mod h1:Cu216AlQ7rAYa8aDBqB2OgNfu5p24Uj+tG9RxV8Wbfs= github.com/projectdiscovery/wappalyzergo v0.0.87 h1:IgWgMWU7jjpR5EBvNN2yP/uSNHl02+SPYaorG4KJ6Fs= github.com/projectdiscovery/wappalyzergo v0.0.87/go.mod h1:HvYuW0Be4JCjVds/+XAEaMSqRG9yrI97UmZq0TPk6A0= github.com/remeh/sizedwaitgroup v1.0.0 h1:VNGGFwNo/R5+MJBf6yrsr110p0m4/OX4S3DCy7Kyl5E= diff --git a/internal/runner/executer.go b/internal/runner/executer.go index b6acddcc..e3d4033c 100644 --- a/internal/runner/executer.go +++ b/internal/runner/executer.go @@ -1,8 +1,11 @@ package runner import ( + "strings" + "github.com/projectdiscovery/gologger" errorutil "github.com/projectdiscovery/utils/errors" + urlutil "github.com/projectdiscovery/utils/url" "github.com/remeh/sizedwaitgroup" ) @@ -18,7 +21,7 @@ func (r *Runner) ExecuteCrawling() error { wg := sizedwaitgroup.New(r.options.Parallelism) for _, input := range inputs { wg.Add() - + input = addSchemeIfNotExists(input) go func(input string) { defer wg.Done() @@ -30,3 +33,21 @@ func (r *Runner) ExecuteCrawling() error { wg.Wait() return nil } + +// scheme less urls are skipped and are required for headless mode and other purposes +// this method adds scheme if given input does not have any +func addSchemeIfNotExists(inputURL string) string { + if strings.HasPrefix(inputURL, urlutil.HTTP) || strings.HasPrefix(inputURL, urlutil.HTTPS) { + return inputURL + } + parsed, err := urlutil.Parse(inputURL) + if err != nil { + gologger.Warning().Msgf("input %v is not a valid url got %v", inputURL, err) + return inputURL + } + if parsed.Port() != "" && (parsed.Port() == "80" || parsed.Port() == "8080") { + return urlutil.HTTP + urlutil.SchemeSeparator + inputURL + } else { + return urlutil.HTTPS + urlutil.SchemeSeparator + inputURL + } +} diff --git a/internal/runner/options.go b/internal/runner/options.go index 5b355254..432f1a53 100644 --- a/internal/runner/options.go +++ b/internal/runner/options.go @@ -22,9 +22,6 @@ func validateOptions(options *types.Options) error { if options.MaxDepth <= 0 && options.CrawlDuration <= 0 { return errorutil.New("either max-depth or crawl-duration must be specified") } - if options.Verbose { - gologger.DefaultLogger.SetMaxLevel(levels.LevelVerbose) - } if len(options.URLs) == 0 && !fileutil.HasStdin() { return errorutil.New("no inputs specified for crawler") } @@ -107,9 +104,12 @@ func normalizeInput(value string) string { func configureOutput(options *types.Options) { if options.Silent { gologger.DefaultLogger.SetMaxLevel(levels.LevelSilent) - } - if options.Verbose { - gologger.DefaultLogger.SetMaxLevel(levels.LevelVerbose) + } else if options.Verbose { + gologger.DefaultLogger.SetMaxLevel(levels.LevelWarning) + } else if options.Debug { + gologger.DefaultLogger.SetMaxLevel(levels.LevelDebug) + } else { + gologger.DefaultLogger.SetMaxLevel(levels.LevelInfo) } // logutil.DisableDefaultLogger() diff --git a/pkg/engine/common/base.go b/pkg/engine/common/base.go index 8d91e378..f2bc1012 100644 --- a/pkg/engine/common/base.go +++ b/pkg/engine/common/base.go @@ -88,6 +88,7 @@ func (s *Shared) Enqueue(queue *queue.Queue, navigationRequests ...*navigation.R func (s *Shared) ValidateScope(URL string, root string) bool { parsed, err := urlutil.Parse(URL) if err != nil { + gologger.Warning().Msgf("failed to parse url while validating scope: %v", err) return false } scopeValidated, err := s.Options.ScopeManager.Validate(parsed.URL, root) @@ -197,18 +198,21 @@ func (s *Shared) Do(crawlSession *CrawlSession, doRequest DoRequestFunc) error { } if !utils.IsURL(req.URL) { + gologger.Debug().Msgf("`%v` not a url. skipping", req.URL) continue } if ok, err := s.Options.ValidateScope(req.URL, crawlSession.Hostname); err != nil || !ok { + gologger.Debug().Msgf("`%v` not in scope. skipping", req.URL) continue } if !s.Options.ValidatePath(req.URL) { + gologger.Debug().Msgf("`%v` not a valid path. skipping", req.URL) continue } wg.Add() - + // gologger.Debug().Msgf("Visting: %v", req.URL) // not sure if this is needed go func() { defer wg.Done() diff --git a/pkg/engine/hybrid/hybrid.go b/pkg/engine/hybrid/hybrid.go index eb130ed8..2b562ed7 100644 --- a/pkg/engine/hybrid/hybrid.go +++ b/pkg/engine/hybrid/hybrid.go @@ -7,6 +7,7 @@ import ( "github.com/go-rod/rod" "github.com/go-rod/rod/lib/launcher" "github.com/go-rod/rod/lib/launcher/flags" + "github.com/projectdiscovery/gologger" "github.com/projectdiscovery/katana/pkg/engine/common" "github.com/projectdiscovery/katana/pkg/types" errorutil "github.com/projectdiscovery/utils/errors" @@ -146,6 +147,7 @@ func (c *Crawler) Crawl(rootURL string) error { } } + gologger.Info().Msgf("Started Headless Crawling Target: %v", rootURL) if err := c.Do(crawlSession, c.navigateRequest); err != nil { return errorutil.NewWithErr(err).WithTag("standard") } diff --git a/pkg/engine/parser/files/robotstxt_test.go b/pkg/engine/parser/files/robotstxt_test.go index f91a763d..a60882a7 100644 --- a/pkg/engine/parser/files/robotstxt_test.go +++ b/pkg/engine/parser/files/robotstxt_test.go @@ -24,7 +24,8 @@ Disallow: /test/includes/ # Allow: /random/ Sitemap: https://example.com/sitemap.xml` - parsed, _ := urlutil.Parse("http://localhost/robots.txt") + parsed, err := urlutil.Parse("http://localhost/robots.txt") + require.Nil(t, err) navigationRequests, err := crawler.parseReader(strings.NewReader(content), &http.Response{Request: &http.Request{URL: parsed.URL}}) require.Nil(t, err) diff --git a/pkg/engine/parser/files/sitemapxml_test.go b/pkg/engine/parser/files/sitemapxml_test.go index 3aa398ff..a3816e8b 100644 --- a/pkg/engine/parser/files/sitemapxml_test.go +++ b/pkg/engine/parser/files/sitemapxml_test.go @@ -21,7 +21,8 @@ func TestSitemapXmlParseReader(t *testing.T) { 2019-06-19T12:00:00+00:00 ` - parsed, _ := urlutil.Parse("http://security-crawl-maze.app/sitemap.xml") + parsed, err := urlutil.Parse("http://security-crawl-maze.app/sitemap.xml") + require.Nil(t, err) navigationRequests, err := crawler.parseReader(strings.NewReader(content), &http.Response{Request: &http.Request{URL: parsed.URL}}) require.Nil(t, err) for _, navReq := range navigationRequests { diff --git a/pkg/engine/parser/parser.go b/pkg/engine/parser/parser.go index 03c4613f..c836c5bd 100644 --- a/pkg/engine/parser/parser.go +++ b/pkg/engine/parser/parser.go @@ -2,10 +2,10 @@ package parser import ( "mime/multipart" - "net/url" "strings" "github.com/PuerkitoBio/goquery" + "github.com/projectdiscovery/gologger" "github.com/projectdiscovery/katana/pkg/navigation" "github.com/projectdiscovery/katana/pkg/output" "github.com/projectdiscovery/katana/pkg/types" @@ -525,12 +525,13 @@ func bodyFormTagParser(resp *navigation.Response) (navigationRequests []*navigat parsed, err := urlutil.Parse(actionURL) if err != nil { + gologger.Warning().Msgf("bodyFormTagParser :failed to parse url %v got %v", actionURL, err) return } isMultipartForm := strings.HasPrefix(encType, "multipart/") - queryValuesWriter := make(url.Values) + queryValuesWriter := make(urlutil.Params) var sb strings.Builder var multipartWriter *multipart.Writer @@ -579,8 +580,7 @@ func bodyFormTagParser(resp *navigation.Response) (navigationRequests []*navigat } switch method { case "GET": - parsed.Update() - parsed.RawQuery = queryValuesWriter.Encode() + parsed.Params.Merge(queryValuesWriter) req.URL = parsed.URL.String() case "POST": if multipartWriter != nil { diff --git a/pkg/engine/standard/standard.go b/pkg/engine/standard/standard.go index 55240462..ee00f6c0 100644 --- a/pkg/engine/standard/standard.go +++ b/pkg/engine/standard/standard.go @@ -1,6 +1,7 @@ package standard import ( + "github.com/projectdiscovery/gologger" "github.com/projectdiscovery/katana/pkg/engine/common" "github.com/projectdiscovery/katana/pkg/types" errorutil "github.com/projectdiscovery/utils/errors" @@ -32,7 +33,7 @@ func (c *Crawler) Crawl(rootURL string) error { return errorutil.NewWithErr(err).WithTag("standard") } defer crawlSession.CancelFunc() - + gologger.Info().Msgf("Started Crawling Target: %v", rootURL) if err := c.Do(crawlSession, c.makeRequest); err != nil { return errorutil.NewWithErr(err).WithTag("standard") } diff --git a/pkg/output/fields.go b/pkg/output/fields.go index 0fc81fd5..e22dc7b4 100644 --- a/pkg/output/fields.go +++ b/pkg/output/fields.go @@ -7,6 +7,7 @@ import ( "path" "strings" + "github.com/projectdiscovery/gologger" errorutil "github.com/projectdiscovery/utils/errors" stringsutil "github.com/projectdiscovery/utils/strings" urlutil "github.com/projectdiscovery/utils/url" @@ -62,6 +63,7 @@ func validateFieldNames(names string) error { func storeFields(output *Result, storeFields []string) { parsed, err := urlutil.Parse(output.Request.URL) if err != nil { + gologger.Warning().Msgf("storeFields: failed to parse url %v got %v", output.Request.URL, err) return } diff --git a/pkg/types/options.go b/pkg/types/options.go index 6dbaa96c..7b55ba8e 100644 --- a/pkg/types/options.go +++ b/pkg/types/options.go @@ -120,6 +120,8 @@ type Options struct { DisableUpdateCheck bool //IgnoreQueryParams ignore crawling same path with different query-param values IgnoreQueryParams bool + // Debug + Debug bool } func (options *Options) ParseCustomHeaders() map[string]string { diff --git a/pkg/utils/extensions/extensions.go b/pkg/utils/extensions/extensions.go index 148ac9e1..aba6d0f9 100644 --- a/pkg/utils/extensions/extensions.go +++ b/pkg/utils/extensions/extensions.go @@ -4,6 +4,7 @@ import ( "path" "strings" + "github.com/projectdiscovery/gologger" urlutil "github.com/projectdiscovery/utils/url" ) @@ -38,7 +39,10 @@ func NewValidator(extensionsMatch, extensionsFilter []string) *Validator { // ValidatePath returns true if an extension is allowed by the validator func (e *Validator) ValidatePath(item string) bool { var extension string - u, _ := urlutil.Parse(item) + u, err := urlutil.Parse(item) + if err != nil { + gologger.Warning().Msgf("validatepath: failed to parse url %v got %v", item, err) + } if u.Path != "" { extension = strings.ToLower(path.Ext(u.Path)) } else { diff --git a/pkg/utils/utils.go b/pkg/utils/utils.go index 170ab9b2..8b6ec0e3 100644 --- a/pkg/utils/utils.go +++ b/pkg/utils/utils.go @@ -4,13 +4,18 @@ import ( "strings" "github.com/lukasbob/srcset" - stringsutil "github.com/projectdiscovery/utils/strings" + "github.com/projectdiscovery/gologger" urlutil "github.com/projectdiscovery/utils/url" ) // IsURL returns true if a provided string is URL func IsURL(url string) bool { - return stringsutil.HasPrefixAny(url, "http://", "https://") + if value, err := urlutil.Parse(url); err == nil { + return value.Hostname() != "" + } else { + gologger.Debug().Msgf("IsURL: failed to parse url %v got %v", url, err) + } + return false } // ParseSRCSetTag parses srcset tag returning found URLs From 235552aaea73bdba7e6b57693020aa99676db340 Mon Sep 17 00:00:00 2001 From: Tarun Koyalwar Date: Fri, 7 Apr 2023 20:40:47 +0530 Subject: [PATCH 3/7] update skip message --- pkg/engine/common/base.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/engine/common/base.go b/pkg/engine/common/base.go index f2bc1012..1664b3e5 100644 --- a/pkg/engine/common/base.go +++ b/pkg/engine/common/base.go @@ -207,7 +207,7 @@ func (s *Shared) Do(crawlSession *CrawlSession, doRequest DoRequestFunc) error { continue } if !s.Options.ValidatePath(req.URL) { - gologger.Debug().Msgf("`%v` not a valid path. skipping", req.URL) + gologger.Debug().Msgf("skipping url with blacklisted extension %v", req.URL) continue } From ce8a482304b62de5d20d656384eaed8f9e383338 Mon Sep 17 00:00:00 2001 From: Tarun Koyalwar Date: Fri, 7 Apr 2023 20:48:55 +0530 Subject: [PATCH 4/7] fix failing test --- pkg/engine/parser/parser.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/engine/parser/parser.go b/pkg/engine/parser/parser.go index c836c5bd..b4a2a3b3 100644 --- a/pkg/engine/parser/parser.go +++ b/pkg/engine/parser/parser.go @@ -581,7 +581,7 @@ func bodyFormTagParser(resp *navigation.Response) (navigationRequests []*navigat switch method { case "GET": parsed.Params.Merge(queryValuesWriter) - req.URL = parsed.URL.String() + req.URL = parsed.String() case "POST": if multipartWriter != nil { req.Body = sb.String() From 11fabb16b0f2ca35eb3deedaedead891acdc047b Mon Sep 17 00:00:00 2001 From: Tarun Koyalwar Date: Fri, 7 Apr 2023 20:53:32 +0530 Subject: [PATCH 5/7] add .webp to extension filter --- pkg/utils/extensions/extensions.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/utils/extensions/extensions.go b/pkg/utils/extensions/extensions.go index aba6d0f9..87c61316 100644 --- a/pkg/utils/extensions/extensions.go +++ b/pkg/utils/extensions/extensions.go @@ -9,7 +9,7 @@ import ( ) // defaultDenylist is the default list of extensions to be denied -var defaultDenylist = []string{".3g2", ".3gp", ".7z", ".apk", ".arj", ".avi", ".axd", ".bmp", ".csv", ".deb", ".dll", ".doc", ".drv", ".eot", ".exe", ".flv", ".gif", ".gifv", ".gz", ".h264", ".ico", ".iso", ".jar", ".jpeg", ".jpg", ".lock", ".m4a", ".m4v", ".map", ".mkv", ".mov", ".mp3", ".mp4", ".mpeg", ".mpg", ".msi", ".ogg", ".ogm", ".ogv", ".otf", ".pdf", ".pkg", ".png", ".ppt", ".psd", ".rar", ".rm", ".rpm", ".svg", ".swf", ".sys", ".tar.gz", ".tar", ".tif", ".tiff", ".ttf", ".txt", ".vob", ".wav", ".webm", ".wmv", ".woff", ".woff2", ".xcf", ".xls", ".xlsx", ".zip"} +var defaultDenylist = []string{".3g2", ".3gp", ".7z", ".apk", ".arj", ".avi", ".axd", ".bmp", ".csv", ".deb", ".dll", ".doc", ".drv", ".eot", ".exe", ".flv", ".gif", ".gifv", ".gz", ".h264", ".ico", ".iso", ".jar", ".jpeg", ".jpg", ".lock", ".m4a", ".m4v", ".map", ".mkv", ".mov", ".mp3", ".mp4", ".mpeg", ".mpg", ".msi", ".ogg", ".ogm", ".ogv", ".otf", ".pdf", ".pkg", ".png", ".ppt", ".psd", ".rar", ".rm", ".rpm", ".svg", ".swf", ".sys", ".tar.gz", ".tar", ".tif", ".tiff", ".ttf", ".txt", ".vob", ".wav", ".webm", ".webp", ".wmv", ".woff", ".woff2", ".xcf", ".xls", ".xlsx", ".zip"} // Validator is a validator for file extension type Validator struct { From 9433ad0569bae3c63fe6e01499b44e2787ed2372 Mon Sep 17 00:00:00 2001 From: Tarun Koyalwar Date: Fri, 7 Apr 2023 21:23:45 +0530 Subject: [PATCH 6/7] update README.md --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 6afd7e59..7babecab 100644 --- a/README.md +++ b/README.md @@ -106,6 +106,9 @@ katana -h This will display help for the tool. Here are all the switches it supports. ```console +Katana is a fast crawler focused on execution in automation +pipelines offering both headless and non-headless crawling. + Usage: ./katana [flags] From 58f2af9c564156f820045d766e1bb67b7eb147aa Mon Sep 17 00:00:00 2001 From: sandeep <8293321+ehsandeep@users.noreply.github.com> Date: Sat, 8 Apr 2023 13:23:53 +0530 Subject: [PATCH 7/7] misc update --- pkg/engine/hybrid/hybrid.go | 2 +- pkg/engine/standard/standard.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/engine/hybrid/hybrid.go b/pkg/engine/hybrid/hybrid.go index 2b562ed7..6c6b4396 100644 --- a/pkg/engine/hybrid/hybrid.go +++ b/pkg/engine/hybrid/hybrid.go @@ -147,7 +147,7 @@ func (c *Crawler) Crawl(rootURL string) error { } } - gologger.Info().Msgf("Started Headless Crawling Target: %v", rootURL) + gologger.Info().Msgf("Started headless crawling for => %v", rootURL) if err := c.Do(crawlSession, c.navigateRequest); err != nil { return errorutil.NewWithErr(err).WithTag("standard") } diff --git a/pkg/engine/standard/standard.go b/pkg/engine/standard/standard.go index ee00f6c0..b90429fb 100644 --- a/pkg/engine/standard/standard.go +++ b/pkg/engine/standard/standard.go @@ -33,7 +33,7 @@ func (c *Crawler) Crawl(rootURL string) error { return errorutil.NewWithErr(err).WithTag("standard") } defer crawlSession.CancelFunc() - gologger.Info().Msgf("Started Crawling Target: %v", rootURL) + gologger.Info().Msgf("Started standard crawling for => %v", rootURL) if err := c.Do(crawlSession, c.makeRequest); err != nil { return errorutil.NewWithErr(err).WithTag("standard") }