From a2cf86072bec4c9b5345206250cff3d64ff3b638 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Do=C4=9Fan=20Can=20Bak=C4=B1r?= Date: Tue, 27 Feb 2024 12:34:30 +0300 Subject: [PATCH 01/15] introduce passive crawling --- cmd/example/example.go | 46 ++++++ cmd/katana/main.go | 4 + go.mod | 8 +- go.sum | 17 ++- internal/runner/runner.go | 3 + pkg/engine/passive/doc.go | 3 + pkg/engine/passive/extractor/extractor.go | 6 + .../passive/extractor/regex_extractor.go | 29 ++++ pkg/engine/passive/httpclient/httpclient.go | 124 ++++++++++++++++ pkg/engine/passive/passive.go | 103 +++++++++++++ pkg/engine/passive/registry.go | 12 ++ .../passive/source/commoncrawl/commoncrawl.go | 136 ++++++++++++++++++ pkg/engine/passive/source/source.go | 20 +++ .../source/waybackarchive/waybackarchive.go | 65 +++++++++ pkg/types/options.go | 4 + 15 files changed, 575 insertions(+), 5 deletions(-) create mode 100644 cmd/example/example.go create mode 100644 pkg/engine/passive/doc.go create mode 100644 pkg/engine/passive/extractor/extractor.go create mode 100644 pkg/engine/passive/extractor/regex_extractor.go create mode 100644 pkg/engine/passive/httpclient/httpclient.go create mode 100644 pkg/engine/passive/passive.go create mode 100644 pkg/engine/passive/registry.go create mode 100644 pkg/engine/passive/source/commoncrawl/commoncrawl.go create mode 100644 pkg/engine/passive/source/source.go create mode 100644 pkg/engine/passive/source/waybackarchive/waybackarchive.go diff --git a/cmd/example/example.go b/cmd/example/example.go new file mode 100644 index 00000000..046570b2 --- /dev/null +++ b/cmd/example/example.go @@ -0,0 +1,46 @@ +package main + +import ( + "math" + + "github.com/projectdiscovery/gologger" + "github.com/projectdiscovery/gologger/levels" + "github.com/projectdiscovery/katana/pkg/engine/standard" + "github.com/projectdiscovery/katana/pkg/output" + "github.com/projectdiscovery/katana/pkg/types" +) + +func main() { + gologger.DefaultLogger.SetMaxLevel(levels.LevelSilent) + + options := &types.Options{ + MaxDepth: 3, // Maximum depth to crawl + FieldScope: "rdn", // Crawling Scope Field + BodyReadSize: math.MaxInt, // Maximum response size to read + Timeout: 10, // Timeout is the time to wait for request in seconds + Concurrency: 10, // Concurrency is the number of concurrent crawling goroutines + OutputMatchRegex: []string{"policies"}, + Parallelism: 10, // Parallelism is the number of urls processing goroutines + Delay: 0, // Delay is the delay between each crawl requests in seconds + RateLimit: 150, // Maximum requests to send per second + Strategy: "depth-first", // Visit strategy (depth-first, breadth-first) + OnResult: func(result output.Result) { // Callback function to execute for result + gologger.Info().Msg(result.Request.URL) + }, + } + crawlerOptions, err := types.NewCrawlerOptions(options) + if err != nil { + gologger.Fatal().Msg(err.Error()) + } + defer crawlerOptions.Close() + crawler, err := standard.New(crawlerOptions) + if err != nil { + gologger.Fatal().Msg(err.Error()) + } + defer crawler.Close() + var input = "https://www.hackerone.com" + err = crawler.Crawl(input) + if err != nil { + gologger.Warning().Msgf("Could not crawl %s: %s", input, err.Error()) + } +} diff --git a/cmd/katana/main.go b/cmd/katana/main.go index 9faee12c..bc5f6f0b 100644 --- a/cmd/katana/main.go +++ b/cmd/katana/main.go @@ -126,6 +126,10 @@ pipelines offering both headless and non-headless crawling.`) flagSet.StringVarP(&options.ChromeWSUrl, "chrome-ws-url", "cwu", "", "use chrome browser instance launched elsewhere 
with the debugger listening at this URL"), flagSet.BoolVarP(&options.XhrExtraction, "xhr-extraction", "xhr", false, "extract xhr request url,method in jsonl output"), ) + flagSet.CreateGroup("passive", "Passive", + flagSet.BoolVarP(&options.Passive, "passive", "ps", false, "enable passive sources to discover target endpoints"), + flagSet.StringSliceVarP(&options.PassiveSource, "passive-source", "pss", nil, "passive source to use for url discovery (wayback,urlscan,commoncrawl,virustotal,alienvault)", goflags.NormalizedStringSliceOptions), + ) flagSet.CreateGroup("scope", "Scope", flagSet.StringSliceVarP(&options.Scope, "crawl-scope", "cs", nil, "in scope url regex to be followed by crawler", goflags.FileCommaSeparatedStringSliceOptions), diff --git a/go.mod b/go.mod index 8d1c4db8..b24916fa 100644 --- a/go.mod +++ b/go.mod @@ -13,12 +13,13 @@ require ( github.com/pkg/errors v0.9.1 github.com/projectdiscovery/dsl v0.0.45 github.com/projectdiscovery/fastdialer v0.0.59 - github.com/projectdiscovery/goflags v0.1.39 + github.com/projectdiscovery/goflags v0.1.40 github.com/projectdiscovery/gologger v1.1.12 github.com/projectdiscovery/hmap v0.0.40 github.com/projectdiscovery/mapcidr v1.1.16 github.com/projectdiscovery/ratelimit v0.0.30 github.com/projectdiscovery/retryablehttp-go v1.0.49 + github.com/projectdiscovery/useragent v0.0.39 github.com/projectdiscovery/utils v0.0.79 github.com/projectdiscovery/wappalyzergo v0.0.109 github.com/remeh/sizedwaitgroup v1.0.0 @@ -55,6 +56,7 @@ require ( github.com/kataras/jwt v0.1.8 // indirect github.com/klauspost/compress v1.16.7 // indirect github.com/klauspost/pgzip v1.2.5 // indirect + github.com/kr/pretty v0.3.1 // indirect github.com/lucasb-eyer/go-colorful v1.2.0 // indirect github.com/mattn/go-colorable v0.1.13 // indirect github.com/mattn/go-isatty v0.0.19 // indirect @@ -68,9 +70,11 @@ require ( github.com/projectdiscovery/asnmap v1.0.6 // indirect github.com/projectdiscovery/blackrock v0.0.1 // indirect github.com/projectdiscovery/gostruct v0.0.2 // indirect + github.com/projectdiscovery/stringsutil v0.0.2 // indirect github.com/quic-go/quic-go v0.37.7 // indirect github.com/refraction-networking/utls v1.5.4 // indirect github.com/rivo/uniseg v0.4.4 // indirect + github.com/rogpeppe/go-internal v1.12.0 // indirect github.com/sashabaranov/go-openai v1.14.2 // indirect github.com/shoenig/go-m1cpu v0.1.6 // indirect github.com/smacker/go-tree-sitter v0.0.0-20220628134258-ac06e95cfa11 // indirect @@ -132,7 +136,7 @@ require ( github.com/zmap/zcrypto v0.0.0-20230422215203-9a665e1e9968 // indirect go.etcd.io/bbolt v1.3.7 // indirect golang.org/x/crypto v0.17.0 // indirect - golang.org/x/exp v0.0.0-20230626212559-97b1e661b5df // indirect + golang.org/x/exp v0.0.0-20230626212559-97b1e661b5df golang.org/x/mod v0.12.0 // indirect golang.org/x/sys v0.16.0 // indirect golang.org/x/text v0.14.0 // indirect diff --git a/go.sum b/go.sum index 400c6b5d..9be99951 100644 --- a/go.sum +++ b/go.sum @@ -44,6 +44,7 @@ github.com/cloudflare/circl v1.3.7 h1:qlCDlTPz2n9fu58M0Nh1J/JzcFpfgkFHHX3O35r5vc github.com/cloudflare/circl v1.3.7/go.mod h1:sRTcRWXGLrKw6yIGJ+l7amYJFfAXbZG0kBSc8r4zxgA= github.com/cnf/structhash v0.0.0-20201127153200-e1b16c1ebc08 h1:ox2F0PSMlrAAiAdknSRMDrAr8mfxPCfSZolH+/qQnyQ= github.com/cnf/structhash v0.0.0-20201127153200-e1b16c1ebc08/go.mod h1:pCxVEbcm3AMg7ejXyorUXi6HQCzOIBf7zEDVPtw0/U4= +github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/davecgh/go-spew v1.1.0/go.mod 
h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -124,8 +125,9 @@ github.com/klauspost/cpuid v1.2.0/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgo github.com/klauspost/pgzip v1.2.5 h1:qnWYvvKqedOF2ulHpMG72XQol4ILEJ8k2wwRl/Km8oE= github.com/klauspost/pgzip v1.2.5/go.mod h1:Ch1tH69qFZu15pkjo5kYi6mth2Zzwzt50oCQKQE9RUs= github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= -github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI= github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= +github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= @@ -190,6 +192,7 @@ github.com/onsi/gomega v1.27.6/go.mod h1:PIQNjfQwkP3aQAH7lf7j87O/5FiNr+ZR8+ipb+q github.com/op/go-logging v0.0.0-20160315200505-970db520ece7/go.mod h1:HzydrMdWErDVzsI23lYNej1Htcns9BCg93Dk0bBINWk= github.com/pierrec/lz4/v4 v4.1.2 h1:qvY3YFXRQE/XB8MlLzJH7mSzBs74eA2gg52YTk6jUPM= github.com/pierrec/lz4/v4 v4.1.2/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4= +github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= @@ -204,8 +207,8 @@ github.com/projectdiscovery/dsl v0.0.45 h1:fDMmBkpk5L7NhvihBxYoIch7UVBqLyKI6JzqK github.com/projectdiscovery/dsl v0.0.45/go.mod h1:G9mK7rQT5FkmNcMlCCfYxQrFcmSucSGWb2QgJZC0i0A= github.com/projectdiscovery/fastdialer v0.0.59 h1:5D0ws7JsYYMC8lm2VKgf91q0kAk6dzqoSHmLIuA2mww= github.com/projectdiscovery/fastdialer v0.0.59/go.mod h1:VdYQimFCHafSPP3c+OXgu1DP+FNLJq+U6Xk/ybfJ0/A= -github.com/projectdiscovery/goflags v0.1.39 h1:Dj8UY+FJEfxzT20+kyDkeTlGs+Ys19SiXWek7x1tPTs= -github.com/projectdiscovery/goflags v0.1.39/go.mod h1:ouB+HpJvhKZJjT8Bd13RYD6au7sQ7nGylbfad3JFy+E= +github.com/projectdiscovery/goflags v0.1.40 h1:yQCB0m0YxkonbkF7hRlRSGAeWEUJdJh5MLqW2gQFdps= +github.com/projectdiscovery/goflags v0.1.40/go.mod h1:lNrjzQPzSt06waiLmZuBfcusNdLHVvueCgGcSovE6d4= github.com/projectdiscovery/gologger v1.1.12 h1:uX/QkQdip4PubJjjG0+uk5DtyAi1ANPJUvpmimXqv4A= github.com/projectdiscovery/gologger v1.1.12/go.mod h1:DI8nywPLERS5mo8QEA9E7gd5HZ3Je14SjJBH3F5/kLw= github.com/projectdiscovery/gostruct v0.0.2 h1:s8gP8ApugGM4go1pA+sVlPDXaWqNP5BBDDSv7VEdG1M= @@ -222,6 +225,10 @@ github.com/projectdiscovery/retryabledns v1.0.57 h1:+DOL9xYSIx74FRrOIKKHVp5R9ci5 github.com/projectdiscovery/retryabledns v1.0.57/go.mod h1:qIigOcmO9d0Ce/z6mHzLl0Aiz2WJcNk2gUGhRcCQ1k4= github.com/projectdiscovery/retryablehttp-go v1.0.49 h1:mvlvl2kTN+ctpDIRlusVWui7eyFlElBoKTr8crS7yvY= github.com/projectdiscovery/retryablehttp-go v1.0.49/go.mod h1:VaJ7Au+1LP8C2u0qmx4NN1IdAxxkhoXpIcc9LAQzFo4= +github.com/projectdiscovery/stringsutil v0.0.2 h1:uzmw3IVLJSMW1kEg8eCStG/cGbYYZAja8BH3LqqJXMA= +github.com/projectdiscovery/stringsutil v0.0.2/go.mod 
h1:EJ3w6bC5fBYjVou6ryzodQq37D5c6qbAYQpGmAy+DC0= +github.com/projectdiscovery/useragent v0.0.39 h1:s2jyXdtjVo0MfYYkifx7irrOIoA0JhzhZaBkpcoWgV4= +github.com/projectdiscovery/useragent v0.0.39/go.mod h1:wO6GQImJ2IQ5K+GDggS/Rhg6IV9Z2Du6NbqC/um0g0w= github.com/projectdiscovery/utils v0.0.79 h1:ptO3Qo2e24SK5w5yvDk2whsvSEIk7gSX+RNhBQPRKqc= github.com/projectdiscovery/utils v0.0.79/go.mod h1:tBFlI+1warN7y7hKpFf6pqqOszvufENofy9Md0qlZQo= github.com/projectdiscovery/wappalyzergo v0.0.109 h1:BERfwTRn1dvB1tbhyc5m67R8VkC9zbVuPsEq4VEm07k= @@ -236,8 +243,12 @@ github.com/rivo/uniseg v0.1.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJ github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= github.com/rivo/uniseg v0.4.4 h1:8TfxU8dW6PdqD27gjM8MVNuicgxIjxpm4K7x4jp8sis= github.com/rivo/uniseg v0.4.4/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88= +github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs= +github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8= +github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4= github.com/rs/xid v1.5.0 h1:mKX4bl4iPYJtEIxp6CYiUuLQ/8DYMoz0PUdtGgMFRVc= github.com/rs/xid v1.5.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg= +github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU= github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d h1:hrujxIzL1woJ7AwssoOcM/tq5JjjG2yYOc8odClEiXA= github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU= github.com/sashabaranov/go-openai v1.14.2 h1:5DPTtR9JBjKPJS008/A409I5ntFhUPPGCmaAihcPRyo= diff --git a/internal/runner/runner.go b/internal/runner/runner.go index b8071fe6..26a0f6e4 100644 --- a/internal/runner/runner.go +++ b/internal/runner/runner.go @@ -9,6 +9,7 @@ import ( "github.com/projectdiscovery/katana/pkg/engine" "github.com/projectdiscovery/katana/pkg/engine/hybrid" "github.com/projectdiscovery/katana/pkg/engine/parser" + "github.com/projectdiscovery/katana/pkg/engine/passive" "github.com/projectdiscovery/katana/pkg/engine/standard" "github.com/projectdiscovery/katana/pkg/types" "github.com/projectdiscovery/mapcidr" @@ -98,6 +99,8 @@ func New(options *types.Options) (*Runner, error) { switch { case options.Headless: crawler, err = hybrid.New(crawlerOptions) + case options.Passive: + crawler, err = passive.New(crawlerOptions) default: crawler, err = standard.New(crawlerOptions) } diff --git a/pkg/engine/passive/doc.go b/pkg/engine/passive/doc.go new file mode 100644 index 00000000..915dd7b4 --- /dev/null +++ b/pkg/engine/passive/doc.go @@ -0,0 +1,3 @@ +// Package passive implements the functionality for a non-headless crawler. +// It uses net/http for making requests and goquery for scraping web page HTML. +package passive diff --git a/pkg/engine/passive/extractor/extractor.go b/pkg/engine/passive/extractor/extractor.go new file mode 100644 index 00000000..46610eb0 --- /dev/null +++ b/pkg/engine/passive/extractor/extractor.go @@ -0,0 +1,6 @@ +package extractor + +// UrlExtractor is an interface that defines the contract for domain extraction. 
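+// Extract is fed one line of raw source output at a time (for example a
+// single CDX record) and returns every URL it can find in it; the
+// regex-backed implementation in regex_extractor.go lowercases matches
+// before returning them.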
+type UrlExtractor interface { + Extract(text string) []string +} diff --git a/pkg/engine/passive/extractor/regex_extractor.go b/pkg/engine/passive/extractor/regex_extractor.go new file mode 100644 index 00000000..c00a9221 --- /dev/null +++ b/pkg/engine/passive/extractor/regex_extractor.go @@ -0,0 +1,29 @@ +package extractor + +import ( + "regexp" + "strings" +) + +// RegexUrlExtractor is a concrete implementation of the UrlExtractor interface, using regex for extraction. +type RegexUrlExtractor struct { + extractor *regexp.Regexp +} + +// NewRegexUrlExtractor creates a new regular expression to extract urls +func NewRegexUrlExtractor() (*RegexUrlExtractor, error) { + extractor, err := regexp.Compile(`(?:http|https)?://(?:www\.)?[a-zA-Z0-9./?=_%:-]*`) + if err != nil { + return nil, err + } + return &RegexUrlExtractor{extractor: extractor}, nil +} + +// Extract implements the UrlExtractor interface, using the regex to find urls in the given text. +func (re *RegexUrlExtractor) Extract(text string) []string { + matches := re.extractor.FindAllString(text, -1) + for i, match := range matches { + matches[i] = strings.ToLower(match) + } + return matches +} diff --git a/pkg/engine/passive/httpclient/httpclient.go b/pkg/engine/passive/httpclient/httpclient.go new file mode 100644 index 00000000..e3c671da --- /dev/null +++ b/pkg/engine/passive/httpclient/httpclient.go @@ -0,0 +1,124 @@ +package httpclient + +import ( + "bytes" + "context" + "crypto/tls" + "fmt" + "io" + "net" + "net/http" + "net/url" + "time" + + "github.com/projectdiscovery/gologger" + "github.com/projectdiscovery/useragent" +) + +type HttpClient struct { + Client *http.Client +} + +type BasicAuth struct { + Username string + Password string +} + +func NewHttpClient(timeout int) *HttpClient { + Transport := &http.Transport{ + MaxIdleConns: 100, + MaxIdleConnsPerHost: 100, + TLSClientConfig: &tls.Config{ + InsecureSkipVerify: true, + }, + Dial: (&net.Dialer{ + Timeout: time.Duration(timeout) * time.Second, + }).Dial, + } + + client := &http.Client{ + Transport: Transport, + Timeout: time.Duration(timeout) * time.Second, + } + + httpClient := &HttpClient{Client: client} + + return httpClient +} + +func (hc *HttpClient) Get(ctx context.Context, getURL, cookies string, headers map[string]string) (*http.Response, error) { + return hc.HTTPRequest(ctx, http.MethodGet, getURL, cookies, headers, nil, BasicAuth{}) +} + +func (hc *HttpClient) SimpleGet(ctx context.Context, getURL string) (*http.Response, error) { + return hc.HTTPRequest(ctx, http.MethodGet, getURL, "", map[string]string{}, nil, BasicAuth{}) +} + +func (hc *HttpClient) Post(ctx context.Context, postURL, cookies string, headers map[string]string, body io.Reader) (*http.Response, error) { + return hc.HTTPRequest(ctx, http.MethodPost, postURL, cookies, headers, body, BasicAuth{}) +} + +func (hc *HttpClient) SimplePost(ctx context.Context, postURL, contentType string, body io.Reader) (*http.Response, error) { + return hc.HTTPRequest(ctx, http.MethodPost, postURL, "", map[string]string{"Content-Type": contentType}, body, BasicAuth{}) +} + +func (hc *HttpClient) HTTPRequest(ctx context.Context, method, requestURL, cookies string, headers map[string]string, body io.Reader, basicAuth BasicAuth) (*http.Response, error) { + req, err := http.NewRequestWithContext(ctx, method, requestURL, body) + if err != nil { + return nil, err + } + + userAgent := useragent.PickRandom() + req.Header.Set("User-Agent", userAgent.String()) + req.Header.Set("Accept", "*/*") + 
req.Header.Set("Accept-Language", "en") + req.Header.Set("Connection", "close") + + if basicAuth.Username != "" || basicAuth.Password != "" { + req.SetBasicAuth(basicAuth.Username, basicAuth.Password) + } + + if cookies != "" { + req.Header.Set("Cookie", cookies) + } + + for key, value := range headers { + req.Header.Set(key, value) + } + + return httpRequestWrapper(hc.Client, req) +} + +func (hc *HttpClient) DiscardHTTPResponse(response *http.Response) { + if response != nil { + _, err := io.Copy(io.Discard, response.Body) + if err != nil { + gologger.Warning().Msgf("Could not discard response body: %s\n", err) + return + } + response.Body.Close() + } +} + +func (hc *HttpClient) Close() { + hc.Client.CloseIdleConnections() +} + +func httpRequestWrapper(client *http.Client, request *http.Request) (*http.Response, error) { + response, err := client.Do(request) + if err != nil { + return nil, err + } + + if response.StatusCode != http.StatusOK { + requestURL, _ := url.QueryUnescape(request.URL.String()) + + gologger.Debug().MsgFunc(func() string { + buffer := new(bytes.Buffer) + _, _ = buffer.ReadFrom(response.Body) + return fmt.Sprintf("Response for failed request against %s:\n%s", requestURL, buffer.String()) + }) + return response, fmt.Errorf("unexpected status code %d received from %s", response.StatusCode, requestURL) + } + return response, nil +} diff --git a/pkg/engine/passive/passive.go b/pkg/engine/passive/passive.go new file mode 100644 index 00000000..f54b04f5 --- /dev/null +++ b/pkg/engine/passive/passive.go @@ -0,0 +1,103 @@ +package passive + +import ( + "context" + "fmt" + "strings" + "sync" + + "github.com/projectdiscovery/gologger" + "github.com/projectdiscovery/katana/pkg/engine/common" + "github.com/projectdiscovery/katana/pkg/engine/passive/httpclient" + "github.com/projectdiscovery/katana/pkg/engine/passive/source" + "github.com/projectdiscovery/katana/pkg/navigation" + "github.com/projectdiscovery/katana/pkg/types" + "github.com/projectdiscovery/katana/pkg/utils" + errorutil "github.com/projectdiscovery/utils/errors" + urlutil "github.com/projectdiscovery/utils/url" + "golang.org/x/exp/maps" +) + +// Crawler is a passive crawler instance +type Crawler struct { + *common.Shared + sources []source.Source + httpClient *httpclient.HttpClient +} + +// New returns a new passive crawler instance +func New(options *types.CrawlerOptions) (*Crawler, error) { + shared, err := common.NewShared(options) + if err != nil { + return nil, errorutil.NewWithErr(err).WithTag("passive") + } + + sources := make(map[string]source.Source, len(Sources)) + if len(options.Options.PassiveSource) > 0 { + for _, source := range options.Options.PassiveSource { + if s, ok := Sources[source]; ok { + sources[source] = s + } + } + } else { + sources = Sources + } + + if len(sources) == 0 { + gologger.Fatal().Msg("No sources selected for this search") + } + + gologger.Debug().Msgf(fmt.Sprintf("Selected source(s) for this crawl: %s", strings.Join(maps.Keys(sources), ", "))) + + httpClient := httpclient.NewHttpClient(options.Options.Timeout) + return &Crawler{Shared: shared, sources: maps.Values(sources), httpClient: httpClient}, nil +} + +// Close closes the crawler process +func (c *Crawler) Close() error { + return nil +} + +// Crawl crawls a URL with the specified options +func (c *Crawler) Crawl(rootURL string) error { + results := make(chan source.Result) + go func() { + defer close(results) + + ctx := context.Background() + wg := &sync.WaitGroup{} + for _, s := range c.sources { + wg.Add(1) + go 
func(source source.Source) { + for resp := range source.Run(ctx, c.Shared, rootURL) { + results <- resp + } + wg.Done() + }(s) + } + wg.Wait() + }() + + URLs := map[string]struct{}{rootURL: {}} + for result := range results { + URLs[result.Value] = struct{}{} + } + + rootUrlParsed, _ := urlutil.ParseURL(rootURL, true) + for URL := range URLs { + if !utils.IsURL(URL) { + gologger.Debug().Msgf("`%v` not a url. skipping", URL) + continue + } + + if ok, err := c.Options.ValidateScope(URL, rootUrlParsed.Hostname()); err != nil || !ok { + gologger.Debug().Msgf("`%v` not in scope. skipping", URL) + continue + } + + req := &navigation.Request{Method: "GET", URL: URL} + resp := &navigation.Response{} + c.Output(req, resp, nil) + } + return nil +} diff --git a/pkg/engine/passive/registry.go b/pkg/engine/passive/registry.go new file mode 100644 index 00000000..8331e32a --- /dev/null +++ b/pkg/engine/passive/registry.go @@ -0,0 +1,12 @@ +package passive + +import ( + "github.com/projectdiscovery/katana/pkg/engine/passive/source" + "github.com/projectdiscovery/katana/pkg/engine/passive/source/commoncrawl" + "github.com/projectdiscovery/katana/pkg/engine/passive/source/waybackarchive" +) + +var Sources = map[string]source.Source{ + "waybackarchive": &waybackarchive.Source{}, + "commoncrawl": &commoncrawl.Source{}, +} diff --git a/pkg/engine/passive/source/commoncrawl/commoncrawl.go b/pkg/engine/passive/source/commoncrawl/commoncrawl.go new file mode 100644 index 00000000..08a16c43 --- /dev/null +++ b/pkg/engine/passive/source/commoncrawl/commoncrawl.go @@ -0,0 +1,136 @@ +// Package commoncrawl logic +package commoncrawl + +import ( + "bufio" + "context" + "fmt" + "net/url" + "strconv" + "strings" + "time" + + jsoniter "github.com/json-iterator/go" + + "github.com/projectdiscovery/katana/pkg/engine/common" + "github.com/projectdiscovery/katana/pkg/engine/passive/extractor" + "github.com/projectdiscovery/katana/pkg/engine/passive/httpclient" + "github.com/projectdiscovery/katana/pkg/engine/passive/source" +) + +const ( + indexURL = "https://index.commoncrawl.org/collinfo.json" + maxYearsBack = 5 +) + +var year = time.Now().Year() + +type indexResponse struct { + ID string `json:"id"` + APIURL string `json:"cdx-api"` +} + +type Source struct { +} + +func (s *Source) Run(ctx context.Context, sharedCtx *common.Shared, rootUrl string) <-chan source.Result { + results := make(chan source.Result) + + go func() { + defer close(results) + + httpClient := httpclient.NewHttpClient(sharedCtx.Options.Options.Timeout) + resp, err := httpClient.SimpleGet(ctx, indexURL) + if err != nil { + results <- source.Result{Source: s.Name(), Error: err} + httpClient.DiscardHTTPResponse(resp) + return + } + + var indexes []indexResponse + err = jsoniter.NewDecoder(resp.Body).Decode(&indexes) + if err != nil { + results <- source.Result{Source: s.Name(), Error: err} + resp.Body.Close() + return + } + resp.Body.Close() + + years := make([]string, 0) + for i := 0; i < maxYearsBack; i++ { + years = append(years, strconv.Itoa(year-i)) + } + + searchIndexes := make(map[string]string) + for _, year := range years { + for _, index := range indexes { + if strings.Contains(index.ID, year) { + if _, ok := searchIndexes[year]; !ok { + searchIndexes[year] = index.APIURL + break + } + } + } + } + + urlExtractor, _ := extractor.NewRegexUrlExtractor() + for _, apiURL := range searchIndexes { + further := s.getSubdomains(ctx, apiURL, rootUrl, httpClient, urlExtractor, results) + if !further { + break + } + } + }() + + return results +} + +func (s 
*Source) Name() string { + return "commoncrawl" +} + +func (s *Source) NeedsKey() bool { + return false +} + +func (s *Source) AddApiKeys(_ []string) { + // no key needed +} + +func (s *Source) getSubdomains(ctx context.Context, searchURL, rootURL string, httpClient *httpclient.HttpClient, urlExtractor *extractor.RegexUrlExtractor, results chan source.Result) bool { + for { + select { + case <-ctx.Done(): + return false + default: + var headers = map[string]string{"Host": "index.commoncrawl.org"} + resp, err := httpClient.Get(ctx, fmt.Sprintf("%s?url=*.%s", searchURL, rootURL), "", headers) + if err != nil { + results <- source.Result{Source: s.Name(), Error: err} + httpClient.DiscardHTTPResponse(resp) + return false + } + + scanner := bufio.NewScanner(resp.Body) + + for scanner.Scan() { + line := scanner.Text() + if line == "" { + continue + } + line, _ = url.QueryUnescape(line) + for _, extractedURL := range urlExtractor.Extract(line) { + // fix for triple encoded URL + extractedURL = strings.ToLower(extractedURL) + extractedURL = strings.TrimPrefix(extractedURL, "25") + extractedURL = strings.TrimPrefix(extractedURL, "2f") + if extractedURL != "" { + results <- source.Result{Source: s.Name(), Value: extractedURL} + } + } + } + resp.Body.Close() + return true + } + } +} diff --git a/pkg/engine/passive/source/source.go b/pkg/engine/passive/source/source.go new file mode 100644 index 00000000..71752659 --- /dev/null +++ b/pkg/engine/passive/source/source.go @@ -0,0 +1,20 @@ +package source + +import ( + "context" + + "github.com/projectdiscovery/katana/pkg/engine/common" +) + +type Source interface { + Run(context.Context, *common.Shared, string) <-chan Result + Name() string + NeedsKey() bool + AddApiKeys([]string) +} + +type Result struct { + Source string + Value string + Error error +} diff --git a/pkg/engine/passive/source/waybackarchive/waybackarchive.go b/pkg/engine/passive/source/waybackarchive/waybackarchive.go new file mode 100644 index 00000000..f3ffd620 --- /dev/null +++ b/pkg/engine/passive/source/waybackarchive/waybackarchive.go @@ -0,0 +1,65 @@ +package waybackarchive + +import ( + "bufio" + "context" + "fmt" + "net/url" + "strings" + + "github.com/projectdiscovery/katana/pkg/engine/common" + "github.com/projectdiscovery/katana/pkg/engine/passive/extractor" + "github.com/projectdiscovery/katana/pkg/engine/passive/httpclient" + "github.com/projectdiscovery/katana/pkg/engine/passive/source" +) + +type Source struct { +} + +func (s *Source) Run(ctx context.Context, sharedCtx *common.Shared, rootUrl string) <-chan source.Result { + results := make(chan source.Result) + go func() { + defer close(results) + + httpClient := httpclient.NewHttpClient(sharedCtx.Options.Options.Timeout) + resp, err := httpClient.Get(ctx, fmt.Sprintf("http://web.archive.org/cdx/search/cdx?url=*.%s/*&output=txt&fl=original&collapse=urlkey", rootUrl), "", nil) + if err != nil { + results <- source.Result{Source: s.Name(), Error: err} + return + } + defer resp.Body.Close() + + scanner := bufio.NewScanner(resp.Body) + urlExtractor, _ := extractor.NewRegexUrlExtractor() + for scanner.Scan() { + line := scanner.Text() + if line == "" { + continue + } + line, _ = url.QueryUnescape(line) + for _, extractedURL := range urlExtractor.Extract(line) { + // fix for triple encoded URL + extractedURL = strings.ToLower(extractedURL) + extractedURL = strings.TrimPrefix(extractedURL, "25") + extractedURL = strings.TrimPrefix(extractedURL, "2f") + + results <- source.Result{Source: s.Name(), Value: extractedURL} + } + + } 
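+		// NOTE: bufio.Scanner caps tokens at 64KiB by default and scanner.Err()
+		// is never checked here, so an oversized or malformed CDX row silently
+		// ends this source's result stream.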
+ }() + + return results +} + +func (s *Source) Name() string { + return "waybackarchive" +} + +func (s *Source) NeedsKey() bool { + return false +} + +func (s *Source) AddApiKeys(_ []string) { + // no key needed +} diff --git a/pkg/types/options.go b/pkg/types/options.go index 1963830b..af6bc6fb 100644 --- a/pkg/types/options.go +++ b/pkg/types/options.go @@ -124,6 +124,10 @@ type Options struct { HeadlessNoIncognito bool // XhrExtraction extract xhr requests XhrExtraction bool + // Passive enables passive crawling + Passive bool + // PassiveSource is the list of sources for passive crawling + PassiveSource goflags.StringSlice // HealthCheck determines if a self-healthcheck should be performed HealthCheck bool // ErrorLogFile specifies a file to write with the errors of all requests From f7d2a5baa36340741fd4105c04862bcd1ee071e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Do=C4=9Fan=20Can=20Bak=C4=B1r?= Date: Tue, 27 Feb 2024 12:38:22 +0300 Subject: [PATCH 02/15] remove example --- cmd/example/example.go | 46 ------------------------------------------ 1 file changed, 46 deletions(-) delete mode 100644 cmd/example/example.go diff --git a/cmd/example/example.go b/cmd/example/example.go deleted file mode 100644 index 046570b2..00000000 --- a/cmd/example/example.go +++ /dev/null @@ -1,46 +0,0 @@ -package main - -import ( - "math" - - "github.com/projectdiscovery/gologger" - "github.com/projectdiscovery/gologger/levels" - "github.com/projectdiscovery/katana/pkg/engine/standard" - "github.com/projectdiscovery/katana/pkg/output" - "github.com/projectdiscovery/katana/pkg/types" -) - -func main() { - gologger.DefaultLogger.SetMaxLevel(levels.LevelSilent) - - options := &types.Options{ - MaxDepth: 3, // Maximum depth to crawl - FieldScope: "rdn", // Crawling Scope Field - BodyReadSize: math.MaxInt, // Maximum response size to read - Timeout: 10, // Timeout is the time to wait for request in seconds - Concurrency: 10, // Concurrency is the number of concurrent crawling goroutines - OutputMatchRegex: []string{"policies"}, - Parallelism: 10, // Parallelism is the number of urls processing goroutines - Delay: 0, // Delay is the delay between each crawl requests in seconds - RateLimit: 150, // Maximum requests to send per second - Strategy: "depth-first", // Visit strategy (depth-first, breadth-first) - OnResult: func(result output.Result) { // Callback function to execute for result - gologger.Info().Msg(result.Request.URL) - }, - } - crawlerOptions, err := types.NewCrawlerOptions(options) - if err != nil { - gologger.Fatal().Msg(err.Error()) - } - defer crawlerOptions.Close() - crawler, err := standard.New(crawlerOptions) - if err != nil { - gologger.Fatal().Msg(err.Error()) - } - defer crawler.Close() - var input = "https://www.hackerone.com" - err = crawler.Crawl(input) - if err != nil { - gologger.Warning().Msgf("Could not crawl %s: %s", input, err.Error()) - } -} From 2c8d76a81f0c4f88db994ed954d315794b26e37a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Do=C4=9Fan=20Can=20Bak=C4=B1r?= Date: Wed, 28 Feb 2024 14:32:28 +0300 Subject: [PATCH 03/15] add alienvault --- pkg/engine/passive/passive.go | 21 ++--- pkg/engine/passive/registry.go | 2 + .../passive/source/alienvault/alienvault.go | 81 +++++++++++++++++++ 3 files changed, 91 insertions(+), 13 deletions(-) create mode 100644 pkg/engine/passive/source/alienvault/alienvault.go diff --git a/pkg/engine/passive/passive.go b/pkg/engine/passive/passive.go index f54b04f5..1383e490 100644 --- a/pkg/engine/passive/passive.go +++ b/pkg/engine/passive/passive.go @@ 
-60,6 +60,7 @@ func (c *Crawler) Close() error { // Crawl crawls a URL with the specified options func (c *Crawler) Crawl(rootURL string) error { + rootUrlParsed, _ := urlutil.ParseURL(rootURL, true) results := make(chan source.Result) go func() { defer close(results) @@ -69,8 +70,8 @@ func (c *Crawler) Crawl(rootURL string) error { for _, s := range c.sources { wg.Add(1) go func(source source.Source) { - for resp := range source.Run(ctx, c.Shared, rootURL) { - results <- resp + for result := range source.Run(ctx, c.Shared, rootURL) { + results <- result } wg.Done() }(s) @@ -78,24 +79,18 @@ func (c *Crawler) Crawl(rootURL string) error { wg.Wait() }() - URLs := map[string]struct{}{rootURL: {}} for result := range results { - URLs[result.Value] = struct{}{} - } - - rootUrlParsed, _ := urlutil.ParseURL(rootURL, true) - for URL := range URLs { - if !utils.IsURL(URL) { - gologger.Debug().Msgf("`%v` not a url. skipping", URL) + if !utils.IsURL(result.Value) { + gologger.Debug().Msgf("`%v` not a url. skipping", result.Value) continue } - if ok, err := c.Options.ValidateScope(URL, rootUrlParsed.Hostname()); err != nil || !ok { - gologger.Debug().Msgf("`%v` not in scope. skipping", URL) + if ok, err := c.Options.ValidateScope(result.Value, rootUrlParsed.Hostname()); err != nil || !ok { + gologger.Debug().Msgf("`%v` not in scope. skipping", result.Value) continue } - req := &navigation.Request{Method: "GET", URL: URL} + req := &navigation.Request{Method: "GET", URL: result.Value} resp := &navigation.Response{} c.Output(req, resp, nil) } diff --git a/pkg/engine/passive/registry.go b/pkg/engine/passive/registry.go index 8331e32a..b3a7f1d6 100644 --- a/pkg/engine/passive/registry.go +++ b/pkg/engine/passive/registry.go @@ -2,6 +2,7 @@ package passive import ( "github.com/projectdiscovery/katana/pkg/engine/passive/source" + "github.com/projectdiscovery/katana/pkg/engine/passive/source/alienvault" "github.com/projectdiscovery/katana/pkg/engine/passive/source/commoncrawl" "github.com/projectdiscovery/katana/pkg/engine/passive/source/waybackarchive" ) @@ -9,4 +10,5 @@ import ( var Sources = map[string]source.Source{ "waybackarchive": &waybackarchive.Source{}, "commoncrawl": &commoncrawl.Source{}, + "alienvault": &alienvault.Source{}, } diff --git a/pkg/engine/passive/source/alienvault/alienvault.go b/pkg/engine/passive/source/alienvault/alienvault.go new file mode 100644 index 00000000..1c0f797b --- /dev/null +++ b/pkg/engine/passive/source/alienvault/alienvault.go @@ -0,0 +1,81 @@ +package alienvault + +import ( + "context" + "encoding/json" + "fmt" + + "github.com/projectdiscovery/katana/pkg/engine/common" + "github.com/projectdiscovery/katana/pkg/engine/passive/httpclient" + "github.com/projectdiscovery/katana/pkg/engine/passive/source" + urlutil "github.com/projectdiscovery/utils/url" +) + +type alienvaultResponse struct { + URLList []url `json:"url_list"` + HasNext bool `json:"has_next"` +} + +type url struct { + URL string `json:"url"` +} + +type Source struct { +} + +func (s *Source) Run(ctx context.Context, sharedCtx *common.Shared, rootUrl string) <-chan source.Result { + results := make(chan source.Result) + + go func() { + defer close(results) + + if parsedRootUrl, err := urlutil.Parse(rootUrl); err == nil { + rootUrl = parsedRootUrl.Hostname() + } + + page := 1 + for { + httpClient := httpclient.NewHttpClient(sharedCtx.Options.Options.Timeout) + apiURL := fmt.Sprintf("https://otx.alienvault.com/api/v1/indicators/domain/%s/url_list?page=%d", rootUrl, page) + resp, err := 
httpClient.SimpleGet(ctx, apiURL) + if err != nil && resp == nil { + results <- source.Result{Source: s.Name(), Error: err} + httpClient.DiscardHTTPResponse(resp) + return + } + + var response alienvaultResponse + // Get the response body and decode + err = json.NewDecoder(resp.Body).Decode(&response) + if err != nil { + results <- source.Result{Source: s.Name(), Error: err} + resp.Body.Close() + return + } + resp.Body.Close() + + for _, record := range response.URLList { + results <- source.Result{Source: s.Name(), Value: record.URL} + } + + if !response.HasNext { + break + } + page++ + } + }() + + return results +} + +func (s *Source) Name() string { + return "alienvault" +} + +func (s *Source) NeedsKey() bool { + return false +} + +func (s *Source) AddApiKeys(_ []string) { + // no key needed +} From 9e519e45086d9a776f5c53112495f58aee75d8a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Do=C4=9Fan=20Can=20Bak=C4=B1r?= Date: Thu, 29 Feb 2024 09:27:54 +0300 Subject: [PATCH 04/15] fix release-test linux workflow --- .github/workflows/release-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release-test.yml b/.github/workflows/release-test.yml index 9d291aae..bac12da3 100644 --- a/.github/workflows/release-test.yml +++ b/.github/workflows/release-test.yml @@ -46,7 +46,7 @@ jobs: # todo: musl compatible? - name: Install Dependences - run: sudo apt install gcc-aarch64-linux-gnu + run: sudo apt update && sudo apt install gcc-aarch64-linux-gnu - name: release test uses: goreleaser/goreleaser-action@v4 From 96cc9166a6da2fa2698d71a0b201dd8b71c580fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Do=C4=9Fan=20Can=20Bak=C4=B1r?= Date: Thu, 29 Feb 2024 09:33:04 +0300 Subject: [PATCH 05/15] minor --- pkg/engine/passive/source/alienvault/alienvault.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/engine/passive/source/alienvault/alienvault.go b/pkg/engine/passive/source/alienvault/alienvault.go index 1c0f797b..d8a76989 100644 --- a/pkg/engine/passive/source/alienvault/alienvault.go +++ b/pkg/engine/passive/source/alienvault/alienvault.go @@ -33,9 +33,9 @@ func (s *Source) Run(ctx context.Context, sharedCtx *common.Shared, rootUrl stri rootUrl = parsedRootUrl.Hostname() } + httpClient := httpclient.NewHttpClient(sharedCtx.Options.Options.Timeout) page := 1 for { - httpClient := httpclient.NewHttpClient(sharedCtx.Options.Options.Timeout) apiURL := fmt.Sprintf("https://otx.alienvault.com/api/v1/indicators/domain/%s/url_list?page=%d", rootUrl, page) resp, err := httpClient.SimpleGet(ctx, apiURL) if err != nil && resp == nil { From a1128e81bc195d4f112da96fa9472e3edce7f0e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Do=C4=9Fan=20Can=20Bak=C4=B1r?= Date: Sun, 3 Mar 2024 17:57:08 +0300 Subject: [PATCH 06/15] omit empty --- pkg/engine/passive/passive.go | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pkg/engine/passive/passive.go b/pkg/engine/passive/passive.go index 1383e490..6dcda0be 100644 --- a/pkg/engine/passive/passive.go +++ b/pkg/engine/passive/passive.go @@ -90,9 +90,7 @@ func (c *Crawler) Crawl(rootURL string) error { continue } - req := &navigation.Request{Method: "GET", URL: result.Value} - resp := &navigation.Response{} - c.Output(req, resp, nil) + c.Output(&navigation.Request{Method: "GET", URL: result.Value}, nil, nil) } return nil } From f7a3da0e735779b335d2dd1b91534be152430525 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Do=C4=9Fan=20Can=20Bak=C4=B1r?= Date: Sun, 3 Mar 2024 18:07:33 +0300 Subject: [PATCH 07/15] add passive ref 
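This threads a PassiveReference (the source name plus the query URL that
produced each result) from the passive sources through to the output
writer. A minimal consumer-side sketch, using only the field names added
in this patch (pkg/output.Result, pkg/navigation.PassiveReference); the
surrounding OnResult handler and fmt import are assumed:

	if result.PassiveReference != nil {
		fmt.Printf("%s found via %s (%s)\n",
			result.Request.URL,
			result.PassiveReference.Source,
			result.PassiveReference.Reference)
	}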
--- pkg/engine/common/base.go | 15 ++++++++------- pkg/engine/passive/passive.go | 4 +++- .../passive/source/alienvault/alienvault.go | 2 +- .../passive/source/commoncrawl/commoncrawl.go | 9 +++++---- pkg/engine/passive/source/source.go | 7 ++++--- .../source/waybackarchive/waybackarchive.go | 5 +++-- pkg/navigation/response.go | 5 +++++ pkg/output/result.go | 9 +++++---- 8 files changed, 34 insertions(+), 22 deletions(-) diff --git a/pkg/engine/common/base.go b/pkg/engine/common/base.go index 3754ace1..97a0b2cc 100644 --- a/pkg/engine/common/base.go +++ b/pkg/engine/common/base.go @@ -72,7 +72,7 @@ func (s *Shared) Enqueue(queue *queue.Queue, navigationRequests ...*navigation.R // if the user requested anyway out of scope items // they are sent to output without visiting if s.Options.Options.DisplayOutScope { - s.Output(nr, nil, ErrOutOfScope) + s.Output(nr, nil, nil, ErrOutOfScope) } continue } @@ -95,17 +95,18 @@ func (s *Shared) ValidateScope(URL string, root string) bool { return err == nil && scopeValidated } -func (s *Shared) Output(navigationRequest *navigation.Request, navigationResponse *navigation.Response, err error) { +func (s *Shared) Output(navigationRequest *navigation.Request, navigationResponse *navigation.Response, passiveReference *navigation.PassiveReference, err error) { var errData string if err != nil { errData = err.Error() } // Write the found result to output result := &output.Result{ - Timestamp: time.Now(), - Request: navigationRequest, - Response: navigationResponse, - Error: errData, + Timestamp: time.Now(), + Request: navigationRequest, + Response: navigationResponse, + PassiveReference: passiveReference, + Error: errData, } outputErr := s.Options.OutputWriter.Write(result) @@ -223,7 +224,7 @@ func (s *Shared) Do(crawlSession *CrawlSession, doRequest DoRequestFunc) error { resp, err := doRequest(crawlSession, req) - s.Output(req, resp, err) + s.Output(req, resp, nil, err) if err != nil { gologger.Warning().Msgf("Could not request seed URL %s: %s\n", req.URL, err) diff --git a/pkg/engine/passive/passive.go b/pkg/engine/passive/passive.go index 6dcda0be..93be53f1 100644 --- a/pkg/engine/passive/passive.go +++ b/pkg/engine/passive/passive.go @@ -90,7 +90,9 @@ func (c *Crawler) Crawl(rootURL string) error { continue } - c.Output(&navigation.Request{Method: "GET", URL: result.Value}, nil, nil) + req := &navigation.Request{Method: "GET", URL: result.Value} + passiveReference := &navigation.PassiveReference{Source: result.Source, Reference: result.Reference} + c.Output(req, nil, passiveReference, nil) } return nil } diff --git a/pkg/engine/passive/source/alienvault/alienvault.go b/pkg/engine/passive/source/alienvault/alienvault.go index d8a76989..8a4e25d3 100644 --- a/pkg/engine/passive/source/alienvault/alienvault.go +++ b/pkg/engine/passive/source/alienvault/alienvault.go @@ -55,7 +55,7 @@ func (s *Source) Run(ctx context.Context, sharedCtx *common.Shared, rootUrl stri resp.Body.Close() for _, record := range response.URLList { - results <- source.Result{Source: s.Name(), Value: record.URL} + results <- source.Result{Source: s.Name(), Value: record.URL, Reference: apiURL} } if !response.HasNext { diff --git a/pkg/engine/passive/source/commoncrawl/commoncrawl.go b/pkg/engine/passive/source/commoncrawl/commoncrawl.go index 08a16c43..3c072c60 100644 --- a/pkg/engine/passive/source/commoncrawl/commoncrawl.go +++ b/pkg/engine/passive/source/commoncrawl/commoncrawl.go @@ -75,7 +75,7 @@ func (s *Source) Run(ctx context.Context, sharedCtx *common.Shared, rootUrl stri 
urlExtractor, _ := extractor.NewRegexUrlExtractor() for _, apiURL := range searchIndexes { - further := s.getSubdomains(ctx, apiURL, rootUrl, httpClient, urlExtractor, results) + further := s.getURLs(ctx, apiURL, rootUrl, httpClient, urlExtractor, results) if !further { break } @@ -97,14 +97,15 @@ func (s *Source) AddApiKeys(_ []string) { // no key needed } -func (s *Source) getSubdomains(ctx context.Context, searchURL, rootURL string, httpClient *httpclient.HttpClient, urlExtractor *extractor.RegexUrlExtractor, results chan source.Result) bool { +func (s *Source) getURLs(ctx context.Context, searchURL, rootURL string, httpClient *httpclient.HttpClient, urlExtractor *extractor.RegexUrlExtractor, results chan source.Result) bool { for { select { case <-ctx.Done(): return false default: var headers = map[string]string{"Host": "index.commoncrawl.org"} - resp, err := httpClient.Get(ctx, fmt.Sprintf("%s?url=*.%s", searchURL, rootURL), "", headers) + currentSearchURL := fmt.Sprintf("%s?url=*.%s", searchURL, rootURL) + resp, err := httpClient.Get(ctx, currentSearchURL, "", headers) if err != nil { results <- source.Result{Source: s.Name(), Error: err} httpClient.DiscardHTTPResponse(resp) @@ -125,7 +126,7 @@ func (s *Source) getSubdomains(ctx context.Context, searchURL, rootURL string, h extractedURL = strings.TrimPrefix(extractedURL, "25") extractedURL = strings.TrimPrefix(extractedURL, "2f") if extractedURL != "" { - results <- source.Result{Source: s.Name(), Value: extractedURL} + results <- source.Result{Source: s.Name(), Value: extractedURL, Reference: currentSearchURL} } } } diff --git a/pkg/engine/passive/source/source.go b/pkg/engine/passive/source/source.go index 71752659..db5bc897 100644 --- a/pkg/engine/passive/source/source.go +++ b/pkg/engine/passive/source/source.go @@ -14,7 +14,8 @@ type Source interface { } type Result struct { - Source string - Value string - Error error + Source string + Value string + Reference string + Error error } diff --git a/pkg/engine/passive/source/waybackarchive/waybackarchive.go b/pkg/engine/passive/source/waybackarchive/waybackarchive.go index f3ffd620..5fa1c860 100644 --- a/pkg/engine/passive/source/waybackarchive/waybackarchive.go +++ b/pkg/engine/passive/source/waybackarchive/waybackarchive.go @@ -22,7 +22,8 @@ func (s *Source) Run(ctx context.Context, sharedCtx *common.Shared, rootUrl stri defer close(results) httpClient := httpclient.NewHttpClient(sharedCtx.Options.Options.Timeout) - resp, err := httpClient.Get(ctx, fmt.Sprintf("http://web.archive.org/cdx/search/cdx?url=*.%s/*&output=txt&fl=original&collapse=urlkey", rootUrl), "", nil) + searchURL := fmt.Sprintf("http://web.archive.org/cdx/search/cdx?url=*.%s/*&output=txt&fl=original&collapse=urlkey", rootUrl) + resp, err := httpClient.Get(ctx, searchURL, "", nil) if err != nil { results <- source.Result{Source: s.Name(), Error: err} return @@ -43,7 +44,7 @@ func (s *Source) Run(ctx context.Context, sharedCtx *common.Shared, rootUrl stri extractedURL = strings.TrimPrefix(extractedURL, "25") extractedURL = strings.TrimPrefix(extractedURL, "2f") - results <- source.Result{Source: s.Name(), Value: extractedURL} + results <- source.Result{Source: s.Name(), Value: extractedURL, Reference: searchURL} } } diff --git a/pkg/navigation/response.go b/pkg/navigation/response.go index cb6f0ef2..a9dd2b7e 100644 --- a/pkg/navigation/response.go +++ b/pkg/navigation/response.go @@ -42,6 +42,11 @@ type Response struct { StoredResponsePath string `json:"stored_response_path,omitempty"` } +type PassiveReference 
struct { + Source string `json:"source"` + Reference string `json:"reference"` +} + func (n Response) AbsoluteURL(path string) string { if strings.HasPrefix(path, "#") { return "" diff --git a/pkg/output/result.go b/pkg/output/result.go index 9c93922d..90d04c3b 100644 --- a/pkg/output/result.go +++ b/pkg/output/result.go @@ -8,10 +8,11 @@ import ( // Result of the crawling type Result struct { - Timestamp time.Time `json:"timestamp,omitempty"` - Request *navigation.Request `json:"request,omitempty"` - Response *navigation.Response `json:"response,omitempty"` - Error string `json:"error,omitempty"` + Timestamp time.Time `json:"timestamp,omitempty"` + Request *navigation.Request `json:"request,omitempty"` + Response *navigation.Response `json:"response,omitempty"` + PassiveReference *navigation.PassiveReference `json:"passive,omitempty"` + Error string `json:"error,omitempty"` } // HasResponse checks if the result has a valid response From 229a1d6a026d1308159467d612f9877618a2686b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Do=C4=9Fan=20Can=20Bak=C4=B1r?= Date: Sun, 3 Mar 2024 18:18:35 +0300 Subject: [PATCH 08/15] validate CLI flags: can't be used with headless --- internal/runner/options.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/internal/runner/options.go b/internal/runner/options.go index 7ce784f5..d6380f55 100644 --- a/internal/runner/options.go +++ b/internal/runner/options.go @@ -26,6 +26,11 @@ func validateOptions(options *types.Options) error { if len(options.URLs) == 0 && !fileutil.HasStdin() { return errorutil.New("no inputs specified for crawler") } + + if options.Headless && options.Passive { + return errorutil.New("headless mode (-headless) and passive mode (-passive) cannot be used together") + } + if (options.HeadlessOptionalArguments != nil || options.HeadlessNoSandbox || options.SystemChromePath != "") && !options.Headless { return errorutil.New("headless mode (-hl) is required if -ho, -nos or -scp are set") } From a367c7b2870de72130c7f625e9506659a0102741 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Do=C4=9Fan=20Can=20Bak=C4=B1r?= Date: Sun, 3 Mar 2024 18:29:27 +0300 Subject: [PATCH 09/15] duplicate URL check --- pkg/engine/passive/passive.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pkg/engine/passive/passive.go b/pkg/engine/passive/passive.go index 93be53f1..3d7c5d2d 100644 --- a/pkg/engine/passive/passive.go +++ b/pkg/engine/passive/passive.go @@ -79,7 +79,12 @@ func (c *Crawler) Crawl(rootURL string) error { wg.Wait() }() + seenURLs := make(map[string]struct{}) for result := range results { + if _, found := seenURLs[result.Value]; found { + continue + } + if !utils.IsURL(result.Value) { gologger.Debug().Msgf("`%v` not a url. 
skipping", result.Value) continue @@ -90,6 +95,7 @@ func (c *Crawler) Crawl(rootURL string) error { continue } + seenURLs[result.Value] = struct{}{} req := &navigation.Request{Method: "GET", URL: result.Value} passiveReference := &navigation.PassiveReference{Source: result.Source, Reference: result.Reference} c.Output(req, nil, passiveReference, nil) From 0fbe67086164453722dc7603ad087ac6d57a9c39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Do=C4=9Fan=20Can=20Bak=C4=B1r?= Date: Sun, 3 Mar 2024 18:53:18 +0300 Subject: [PATCH 10/15] format CLI output --- pkg/engine/passive/passive.go | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/pkg/engine/passive/passive.go b/pkg/engine/passive/passive.go index 3d7c5d2d..150062ab 100644 --- a/pkg/engine/passive/passive.go +++ b/pkg/engine/passive/passive.go @@ -5,6 +5,7 @@ import ( "fmt" "strings" "sync" + "time" "github.com/projectdiscovery/gologger" "github.com/projectdiscovery/katana/pkg/engine/common" @@ -60,10 +61,16 @@ func (c *Crawler) Close() error { // Crawl crawls a URL with the specified options func (c *Crawler) Crawl(rootURL string) error { + gologger.Info().Msgf("Enumerating passive endpoints for %s", rootURL) + rootUrlParsed, _ := urlutil.ParseURL(rootURL, true) results := make(chan source.Result) + var timeTaken time.Duration go func() { - defer close(results) + defer func(startTime time.Time) { + timeTaken = time.Since(startTime) + close(results) + }(time.Now()) ctx := context.Background() wg := &sync.WaitGroup{} @@ -80,6 +87,7 @@ func (c *Crawler) Crawl(rootURL string) error { }() seenURLs := make(map[string]struct{}) + sourceStats := make(map[string]int) for result := range results { if _, found := seenURLs[result.Value]; found { continue @@ -96,9 +104,18 @@ func (c *Crawler) Crawl(rootURL string) error { } seenURLs[result.Value] = struct{}{} + sourceStats[result.Source]++ + req := &navigation.Request{Method: "GET", URL: result.Value} passiveReference := &navigation.PassiveReference{Source: result.Source, Reference: result.Reference} c.Output(req, nil, passiveReference, nil) } + + var stats []string + for source, count := range sourceStats { + stats = append(stats, fmt.Sprintf("%s: %d", source, count)) + } + + gologger.Info().Msgf("Found %d endpoints for %s in %s (%s)", len(seenURLs), rootURL, timeTaken.String(), strings.Join(stats, ", ")) return nil } From 5ae1d6a073e8bf87a1b18355319e8a515466cefc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Do=C4=9Fan=20Can=20Bak=C4=B1r?= Date: Mon, 11 Mar 2024 11:37:50 +0300 Subject: [PATCH 11/15] refactor extractor pkg --- pkg/engine/passive/extractor/extractor.go | 6 ---- .../passive/extractor/regex_extractor.go | 29 ------------------- pkg/engine/passive/regexp/regexp.go | 16 ++++++++++ .../passive/source/commoncrawl/commoncrawl.go | 9 +++--- .../source/waybackarchive/waybackarchive.go | 5 ++-- 5 files changed, 22 insertions(+), 43 deletions(-) delete mode 100644 pkg/engine/passive/extractor/extractor.go delete mode 100644 pkg/engine/passive/extractor/regex_extractor.go create mode 100644 pkg/engine/passive/regexp/regexp.go diff --git a/pkg/engine/passive/extractor/extractor.go b/pkg/engine/passive/extractor/extractor.go deleted file mode 100644 index 46610eb0..00000000 --- a/pkg/engine/passive/extractor/extractor.go +++ /dev/null @@ -1,6 +0,0 @@ -package extractor - -// UrlExtractor is an interface that defines the contract for domain extraction. 
-type UrlExtractor interface { - Extract(text string) []string -} diff --git a/pkg/engine/passive/extractor/regex_extractor.go b/pkg/engine/passive/extractor/regex_extractor.go deleted file mode 100644 index c00a9221..00000000 --- a/pkg/engine/passive/extractor/regex_extractor.go +++ /dev/null @@ -1,29 +0,0 @@ -package extractor - -import ( - "regexp" - "strings" -) - -// RegexUrlExtractor is a concrete implementation of the UrlExtractor interface, using regex for extraction. -type RegexUrlExtractor struct { - extractor *regexp.Regexp -} - -// NewRegexUrlExtractor creates a new regular expression to extract urls -func NewRegexUrlExtractor() (*RegexUrlExtractor, error) { - extractor, err := regexp.Compile(`(?:http|https)?://(?:www\.)?[a-zA-Z0-9./?=_%:-]*`) - if err != nil { - return nil, err - } - return &RegexUrlExtractor{extractor: extractor}, nil -} - -// Extract implements the UrlExtractor interface, using the regex to find urls in the given text. -func (re *RegexUrlExtractor) Extract(text string) []string { - matches := re.extractor.FindAllString(text, -1) - for i, match := range matches { - matches[i] = strings.ToLower(match) - } - return matches -} diff --git a/pkg/engine/passive/regexp/regexp.go b/pkg/engine/passive/regexp/regexp.go new file mode 100644 index 00000000..785d7f90 --- /dev/null +++ b/pkg/engine/passive/regexp/regexp.go @@ -0,0 +1,16 @@ +package regexp + +import ( + "regexp" + "strings" +) + +var re, _ = regexp.Compile(`(?:http|https)?://(?:www\.)?[a-zA-Z0-9./?=_%:-]*`) + +func Extract(text string) []string { + matches := re.FindAllString(text, -1) + for i, match := range matches { + matches[i] = strings.ToLower(match) + } + return matches +} diff --git a/pkg/engine/passive/source/commoncrawl/commoncrawl.go b/pkg/engine/passive/source/commoncrawl/commoncrawl.go index 3c072c60..9ea8a8dc 100644 --- a/pkg/engine/passive/source/commoncrawl/commoncrawl.go +++ b/pkg/engine/passive/source/commoncrawl/commoncrawl.go @@ -13,8 +13,8 @@ import ( jsoniter "github.com/json-iterator/go" "github.com/projectdiscovery/katana/pkg/engine/common" - "github.com/projectdiscovery/katana/pkg/engine/passive/extractor" "github.com/projectdiscovery/katana/pkg/engine/passive/httpclient" + "github.com/projectdiscovery/katana/pkg/engine/passive/regexp" "github.com/projectdiscovery/katana/pkg/engine/passive/source" ) @@ -73,9 +73,8 @@ func (s *Source) Run(ctx context.Context, sharedCtx *common.Shared, rootUrl stri } } - urlExtractor, _ := extractor.NewRegexUrlExtractor() for _, apiURL := range searchIndexes { - further := s.getURLs(ctx, apiURL, rootUrl, httpClient, urlExtractor, results) + further := s.getURLs(ctx, apiURL, rootUrl, httpClient, results) if !further { break } @@ -97,7 +96,7 @@ func (s *Source) AddApiKeys(_ []string) { // no key needed } -func (s *Source) getURLs(ctx context.Context, searchURL, rootURL string, httpClient *httpclient.HttpClient, urlExtractor *extractor.RegexUrlExtractor, results chan source.Result) bool { +func (s *Source) getURLs(ctx context.Context, searchURL, rootURL string, httpClient *httpclient.HttpClient, results chan source.Result) bool { for { select { case <-ctx.Done(): @@ -120,7 +119,7 @@ func (s *Source) getURLs(ctx context.Context, searchURL, rootURL string, httpCli continue } line, _ = url.QueryUnescape(line) - for _, extractedURL := range urlExtractor.Extract(line) { + for _, extractedURL := range regexp.Extract(line) { // fix for triple encoded URL extractedURL = strings.ToLower(extractedURL) extractedURL = strings.TrimPrefix(extractedURL, "25") diff 
--git a/pkg/engine/passive/source/waybackarchive/waybackarchive.go b/pkg/engine/passive/source/waybackarchive/waybackarchive.go index 5fa1c860..db023d75 100644 --- a/pkg/engine/passive/source/waybackarchive/waybackarchive.go +++ b/pkg/engine/passive/source/waybackarchive/waybackarchive.go @@ -8,8 +8,8 @@ import ( "strings" "github.com/projectdiscovery/katana/pkg/engine/common" - "github.com/projectdiscovery/katana/pkg/engine/passive/extractor" "github.com/projectdiscovery/katana/pkg/engine/passive/httpclient" + "github.com/projectdiscovery/katana/pkg/engine/passive/regexp" "github.com/projectdiscovery/katana/pkg/engine/passive/source" ) @@ -31,14 +31,13 @@ func (s *Source) Run(ctx context.Context, sharedCtx *common.Shared, rootUrl stri defer resp.Body.Close() scanner := bufio.NewScanner(resp.Body) - urlExtractor, _ := extractor.NewRegexUrlExtractor() for scanner.Scan() { line := scanner.Text() if line == "" { continue } line, _ = url.QueryUnescape(line) - for _, extractedURL := range urlExtractor.Extract(line) { + for _, extractedURL := range regexp.Extract(line) { // fix for triple encoded URL extractedURL = strings.ToLower(extractedURL) extractedURL = strings.TrimPrefix(extractedURL, "25") From e73aee60454e50d0bceb38ef609b798aae0896ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Do=C4=9Fan=20Can=20Bak=C4=B1r?= Date: Mon, 11 Mar 2024 13:58:42 +0300 Subject: [PATCH 12/15] add response --- pkg/engine/passive/passive.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pkg/engine/passive/passive.go b/pkg/engine/passive/passive.go index 150062ab..ac5e1aa1 100644 --- a/pkg/engine/passive/passive.go +++ b/pkg/engine/passive/passive.go @@ -3,6 +3,7 @@ package passive import ( "context" "fmt" + "net/http" "strings" "sync" "time" @@ -106,9 +107,12 @@ func (c *Crawler) Crawl(rootURL string) error { seenURLs[result.Value] = struct{}{} sourceStats[result.Source]++ + passiveURL, _ := urlutil.Parse(result.Value) req := &navigation.Request{Method: "GET", URL: result.Value} + resp := &navigation.Response{StatusCode: 200, RootHostname: passiveURL.Hostname(), + Resp: &http.Response{StatusCode: 200, Request: &http.Request{Method: "GET", URL: passiveURL.URL}}} passiveReference := &navigation.PassiveReference{Source: result.Source, Reference: result.Reference} - c.Output(req, nil, passiveReference, nil) + c.Output(req, resp, passiveReference, nil) } var stats []string From b9ebdeed8cd152825c319296ef65ec8840333d8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Do=C4=9Fan=20Can=20Bak=C4=B1r?= Date: Mon, 11 Mar 2024 14:04:22 +0300 Subject: [PATCH 13/15] fix go.mod --- go.mod | 1 + go.sum | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/go.mod b/go.mod index e0d7e347..91a93a8d 100644 --- a/go.mod +++ b/go.mod @@ -19,6 +19,7 @@ require ( github.com/projectdiscovery/mapcidr v1.1.16 github.com/projectdiscovery/ratelimit v0.0.30 github.com/projectdiscovery/retryablehttp-go v1.0.51 + github.com/projectdiscovery/useragent v0.0.41 github.com/projectdiscovery/utils v0.0.83 github.com/projectdiscovery/wappalyzergo v0.0.109 github.com/remeh/sizedwaitgroup v1.0.0 diff --git a/go.sum b/go.sum index 7c170d37..761c21b2 100644 --- a/go.sum +++ b/go.sum @@ -227,8 +227,8 @@ github.com/projectdiscovery/retryablehttp-go v1.0.51 h1:8XMrNC8JrwvySESe2d+XWF9b github.com/projectdiscovery/retryablehttp-go v1.0.51/go.mod h1:6cdh/acYHpeYWg7+Iblh4xBRb87bC118L4G4mpvCMuA= github.com/projectdiscovery/stringsutil v0.0.2 h1:uzmw3IVLJSMW1kEg8eCStG/cGbYYZAja8BH3LqqJXMA= github.com/projectdiscovery/stringsutil 
v0.0.2/go.mod h1:EJ3w6bC5fBYjVou6ryzodQq37D5c6qbAYQpGmAy+DC0= -github.com/projectdiscovery/useragent v0.0.39 h1:s2jyXdtjVo0MfYYkifx7irrOIoA0JhzhZaBkpcoWgV4= -github.com/projectdiscovery/useragent v0.0.39/go.mod h1:wO6GQImJ2IQ5K+GDggS/Rhg6IV9Z2Du6NbqC/um0g0w= +github.com/projectdiscovery/useragent v0.0.41 h1:GWHPIArnz6/rKpfbqlP484QmHiOFERH0tewvmAh1MHE= +github.com/projectdiscovery/useragent v0.0.41/go.mod h1:oXjattkrFK9Y/8c+9/6aBkAA307L/NWQrs28uJaE9ow= github.com/projectdiscovery/utils v0.0.83 h1:r7OBAuEwe4lyEwTITbCEZytoxvjk/s0Xra2NT+K4fm4= github.com/projectdiscovery/utils v0.0.83/go.mod h1:2XFoaGD5NPUp6liTRHC2tGmMQnIhQSXscpP3zfAG7iE= github.com/projectdiscovery/wappalyzergo v0.0.109 h1:BERfwTRn1dvB1tbhyc5m67R8VkC9zbVuPsEq4VEm07k= From 2b341b6ae1bcfced1b1de390704af2a699169db0 Mon Sep 17 00:00:00 2001 From: mzack Date: Wed, 13 Mar 2024 19:54:55 +0100 Subject: [PATCH 14/15] minor changes --- pkg/engine/passive/passive.go | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/pkg/engine/passive/passive.go b/pkg/engine/passive/passive.go index ac5e1aa1..ebac621b 100644 --- a/pkg/engine/passive/passive.go +++ b/pkg/engine/passive/passive.go @@ -108,10 +108,25 @@ func (c *Crawler) Crawl(rootURL string) error { sourceStats[result.Source]++ passiveURL, _ := urlutil.Parse(result.Value) - req := &navigation.Request{Method: "GET", URL: result.Value} - resp := &navigation.Response{StatusCode: 200, RootHostname: passiveURL.Hostname(), - Resp: &http.Response{StatusCode: 200, Request: &http.Request{Method: "GET", URL: passiveURL.URL}}} - passiveReference := &navigation.PassiveReference{Source: result.Source, Reference: result.Reference} + req := &navigation.Request{ + Method: http.MethodGet, + URL: result.Value, + } + resp := &navigation.Response{ + StatusCode: http.StatusOK, + RootHostname: passiveURL.Hostname(), + Resp: &http.Response{ + StatusCode: http.StatusOK, + Request: &http.Request{ + Method: http.MethodGet, + URL: passiveURL.URL, + }, + }, + } + passiveReference := &navigation.PassiveReference{ + Source: result.Source, + Reference: result.Reference, + } c.Output(req, resp, passiveReference, nil) } From 2f78c8460ed482abf2621f79d5e8108510041a14 Mon Sep 17 00:00:00 2001 From: sandeep <8293321+ehsandeep@users.noreply.github.com> Date: Wed, 20 Mar 2024 22:45:03 +0530 Subject: [PATCH 15/15] misc update --- README.md | 7 ++++++- cmd/katana/main.go | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index a028cee6..9bef2334 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,8 @@ ![image](https://user-images.githubusercontent.com/8293321/199371558-daba03b6-bf9c-4883-8506-76497c6c3a44.png) - Fast And fully configurable web crawling - - **Standard** and **Headless** mode support + - **Standard** and **Headless** mode + - **Active** and **Passive** mode - **JavaScript** parsing / crawling - Customizable **automatic form filling** - **Scope control** - Preconfigured field / Regex @@ -155,6 +156,10 @@ HEADLESS: -cwu, -chrome-ws-url string use chrome browser instance launched elsewhere with the debugger listening at this URL -xhr, -xhr-extraction extract xhr request url,method in jsonl output +PASSIVE: + -ps, -passive enable passive sources to discover target endpoints + -pss, -passive-source string[] passive source to use for url discovery (waybackarchive,commoncrawl,alienvault) + SCOPE: -cs, -crawl-scope string[] in scope url regex to be followed by crawler -cos, -crawl-out-scope string[] out of scope url regex to be excluded by 
crawler diff --git a/cmd/katana/main.go b/cmd/katana/main.go index bc5f6f0b..0d4ac4e7 100644 --- a/cmd/katana/main.go +++ b/cmd/katana/main.go @@ -128,7 +128,7 @@ pipelines offering both headless and non-headless crawling.`) ) flagSet.CreateGroup("passive", "Passive", flagSet.BoolVarP(&options.Passive, "passive", "ps", false, "enable passive sources to discover target endpoints"), - flagSet.StringSliceVarP(&options.PassiveSource, "passive-source", "pss", nil, "passive source to use for url discovery (wayback,urlscan,commoncrawl,virustotal,alienvault)", goflags.NormalizedStringSliceOptions), + flagSet.StringSliceVarP(&options.PassiveSource, "passive-source", "pss", nil, "passive source to use for url discovery (waybackarchive,commoncrawl,alienvault)", goflags.NormalizedStringSliceOptions), ) flagSet.CreateGroup("scope", "Scope",
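
Illustrative invocation of the new passive mode (flag names as registered
in cmd/katana/main.go in this series; the target URL is an example):

    katana -u https://example.com -passive -passive-source waybackarchive,commoncrawl

Each discovered endpoint is emitted as a regular crawl result whose
"passive" JSON field carries the source and reference query it came from,
per the PassiveReference struct introduced in PATCH 07.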