Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

introduce passive crawling #781

Merged
merged 18 commits into from
Mar 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/release-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ jobs:

# todo: musl compatible?
- name: Install Dependences
run: sudo apt install gcc-aarch64-linux-gnu
run: sudo apt update && sudo apt install gcc-aarch64-linux-gnu

- name: release test
uses: goreleaser/goreleaser-action@v4
Expand Down
7 changes: 6 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@
![image](https://user-images.githubusercontent.com/8293321/199371558-daba03b6-bf9c-4883-8506-76497c6c3a44.png)

- Fast And fully configurable web crawling
- **Standard** and **Headless** mode support
- **Standard** and **Headless** mode
- **Active** and **Passive** mode
- **JavaScript** parsing / crawling
- Customizable **automatic form filling**
- **Scope control** - Preconfigured field / Regex
Expand Down Expand Up @@ -155,6 +156,10 @@ HEADLESS:
-cwu, -chrome-ws-url string use chrome browser instance launched elsewhere with the debugger listening at this URL
-xhr, -xhr-extraction extract xhr request url,method in jsonl output

PASSIVE:
-ps, -passive enable passive sources to discover target endpoints
-pss, -passive-source string[] passive source to use for url discovery (waybackarchive,commoncrawl,alienvault)

SCOPE:
-cs, -crawl-scope string[] in scope url regex to be followed by crawler
-cos, -crawl-out-scope string[] out of scope url regex to be excluded by crawler
Expand Down
4 changes: 4 additions & 0 deletions cmd/katana/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,10 @@ pipelines offering both headless and non-headless crawling.`)
flagSet.StringVarP(&options.ChromeWSUrl, "chrome-ws-url", "cwu", "", "use chrome browser instance launched elsewhere with the debugger listening at this URL"),
flagSet.BoolVarP(&options.XhrExtraction, "xhr-extraction", "xhr", false, "extract xhr request url,method in jsonl output"),
)
flagSet.CreateGroup("passive", "Passive",
flagSet.BoolVarP(&options.Passive, "passive", "ps", false, "enable passive sources to discover target endpoints"),
flagSet.StringSliceVarP(&options.PassiveSource, "passive-source", "pss", nil, "passive source to use for url discovery (waybackarchive,commoncrawl,alienvault)", goflags.NormalizedStringSliceOptions),
)

flagSet.CreateGroup("scope", "Scope",
flagSet.StringSliceVarP(&options.Scope, "crawl-scope", "cs", nil, "in scope url regex to be followed by crawler", goflags.FileCommaSeparatedStringSliceOptions),
Expand Down
4 changes: 4 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ require (
github.com/projectdiscovery/mapcidr v1.1.16
github.com/projectdiscovery/ratelimit v0.0.33
github.com/projectdiscovery/retryablehttp-go v1.0.52
github.com/projectdiscovery/useragent v0.0.41
github.com/projectdiscovery/utils v0.0.83
github.com/projectdiscovery/wappalyzergo v0.0.113
github.com/remeh/sizedwaitgroup v1.0.0
Expand Down Expand Up @@ -55,6 +56,7 @@ require (
github.com/kataras/jwt v0.1.8 // indirect
github.com/klauspost/compress v1.16.7 // indirect
github.com/klauspost/pgzip v1.2.5 // indirect
github.com/kr/pretty v0.3.1 // indirect
github.com/lucasb-eyer/go-colorful v1.2.0 // indirect
github.com/mattn/go-colorable v0.1.13 // indirect
github.com/mattn/go-isatty v0.0.19 // indirect
Expand All @@ -69,9 +71,11 @@ require (
github.com/projectdiscovery/blackrock v0.0.1 // indirect
github.com/projectdiscovery/gostruct v0.0.2 // indirect
github.com/projectdiscovery/machineid v0.0.0-20240226150047-2e2c51e35983 // indirect
github.com/projectdiscovery/stringsutil v0.0.2 // indirect
github.com/quic-go/quic-go v0.37.7 // indirect
github.com/refraction-networking/utls v1.5.4 // indirect
github.com/rivo/uniseg v0.4.4 // indirect
github.com/rogpeppe/go-internal v1.12.0 // indirect
github.com/sashabaranov/go-openai v1.14.2 // indirect
github.com/shoenig/go-m1cpu v0.1.6 // indirect
github.com/smacker/go-tree-sitter v0.0.0-20230720070738-0d0a9f78d8f8 // indirect
Expand Down
14 changes: 12 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ github.com/cloudflare/circl v1.3.7 h1:qlCDlTPz2n9fu58M0Nh1J/JzcFpfgkFHHX3O35r5vc
github.com/cloudflare/circl v1.3.7/go.mod h1:sRTcRWXGLrKw6yIGJ+l7amYJFfAXbZG0kBSc8r4zxgA=
github.com/cnf/structhash v0.0.0-20201127153200-e1b16c1ebc08 h1:ox2F0PSMlrAAiAdknSRMDrAr8mfxPCfSZolH+/qQnyQ=
github.com/cnf/structhash v0.0.0-20201127153200-e1b16c1ebc08/go.mod h1:pCxVEbcm3AMg7ejXyorUXi6HQCzOIBf7zEDVPtw0/U4=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
Expand Down Expand Up @@ -126,8 +127,8 @@ github.com/klauspost/pgzip v1.2.5 h1:qnWYvvKqedOF2ulHpMG72XQol4ILEJ8k2wwRl/Km8oE
github.com/klauspost/pgzip v1.2.5/go.mod h1:Ch1tH69qFZu15pkjo5kYi6mth2Zzwzt50oCQKQE9RUs=
github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
github.com/kr/pretty v0.2.1 h1:Fmg33tUaq4/8ym9TJN1x7sLJnHVwhP33CNkpYV/7rwI=
github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
Expand Down Expand Up @@ -192,6 +193,7 @@ github.com/onsi/gomega v1.27.6/go.mod h1:PIQNjfQwkP3aQAH7lf7j87O/5FiNr+ZR8+ipb+q
github.com/op/go-logging v0.0.0-20160315200505-970db520ece7/go.mod h1:HzydrMdWErDVzsI23lYNej1Htcns9BCg93Dk0bBINWk=
github.com/pierrec/lz4/v4 v4.1.2 h1:qvY3YFXRQE/XB8MlLzJH7mSzBs74eA2gg52YTk6jUPM=
github.com/pierrec/lz4/v4 v4.1.2/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4=
github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA=
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
Expand Down Expand Up @@ -226,6 +228,10 @@ github.com/projectdiscovery/retryabledns v1.0.58 h1:ut1FSB9+GZ6zQIlKJFLqIz2RZs81
github.com/projectdiscovery/retryabledns v1.0.58/go.mod h1:RobmKoNBgngAVE4H9REQtaLP1pa4TCyypHy1MWHT1mY=
github.com/projectdiscovery/retryablehttp-go v1.0.52 h1:E1EXok2oXmX1pwCHMyMKkdbiyp0IUxd5bQ7ZbT8AK+o=
github.com/projectdiscovery/retryablehttp-go v1.0.52/go.mod h1:DITjQ0spJHSL81ALR6BEr+yMw/Nxhw0qSdjwF9mGhjI=
github.com/projectdiscovery/stringsutil v0.0.2 h1:uzmw3IVLJSMW1kEg8eCStG/cGbYYZAja8BH3LqqJXMA=
github.com/projectdiscovery/stringsutil v0.0.2/go.mod h1:EJ3w6bC5fBYjVou6ryzodQq37D5c6qbAYQpGmAy+DC0=
github.com/projectdiscovery/useragent v0.0.41 h1:GWHPIArnz6/rKpfbqlP484QmHiOFERH0tewvmAh1MHE=
github.com/projectdiscovery/useragent v0.0.41/go.mod h1:oXjattkrFK9Y/8c+9/6aBkAA307L/NWQrs28uJaE9ow=
github.com/projectdiscovery/utils v0.0.83 h1:r7OBAuEwe4lyEwTITbCEZytoxvjk/s0Xra2NT+K4fm4=
github.com/projectdiscovery/utils v0.0.83/go.mod h1:2XFoaGD5NPUp6liTRHC2tGmMQnIhQSXscpP3zfAG7iE=
github.com/projectdiscovery/wappalyzergo v0.0.113 h1:aoGOY3iGXX6U1RC2TAVEd/s65BESNYYIqpthZvcsZIk=
Expand All @@ -240,8 +246,12 @@ github.com/rivo/uniseg v0.1.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJ
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
github.com/rivo/uniseg v0.4.4 h1:8TfxU8dW6PdqD27gjM8MVNuicgxIjxpm4K7x4jp8sis=
github.com/rivo/uniseg v0.4.4/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs=
github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8=
github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4=
github.com/rs/xid v1.5.0 h1:mKX4bl4iPYJtEIxp6CYiUuLQ/8DYMoz0PUdtGgMFRVc=
github.com/rs/xid v1.5.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg=
github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU=
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d h1:hrujxIzL1woJ7AwssoOcM/tq5JjjG2yYOc8odClEiXA=
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU=
github.com/sashabaranov/go-openai v1.14.2 h1:5DPTtR9JBjKPJS008/A409I5ntFhUPPGCmaAihcPRyo=
Expand Down
5 changes: 5 additions & 0 deletions internal/runner/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,11 @@ func validateOptions(options *types.Options) error {
if len(options.URLs) == 0 && !fileutil.HasStdin() {
return errorutil.New("no inputs specified for crawler")
}

if options.Headless && options.Passive {
return errorutil.New("headless mode (-headless) and passive mode (-passive) cannot be used together")
}

if (options.HeadlessOptionalArguments != nil || options.HeadlessNoSandbox || options.SystemChromePath != "") && !options.Headless {
return errorutil.New("headless mode (-hl) is required if -ho, -nos or -scp are set")
}
Expand Down
3 changes: 3 additions & 0 deletions internal/runner/runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"github.com/projectdiscovery/katana/pkg/engine"
"github.com/projectdiscovery/katana/pkg/engine/hybrid"
"github.com/projectdiscovery/katana/pkg/engine/parser"
"github.com/projectdiscovery/katana/pkg/engine/passive"
"github.com/projectdiscovery/katana/pkg/engine/standard"
"github.com/projectdiscovery/katana/pkg/types"
"github.com/projectdiscovery/mapcidr"
Expand Down Expand Up @@ -98,6 +99,8 @@ func New(options *types.Options) (*Runner, error) {
switch {
case options.Headless:
crawler, err = hybrid.New(crawlerOptions)
case options.Passive:
crawler, err = passive.New(crawlerOptions)
default:
crawler, err = standard.New(crawlerOptions)
}
Expand Down
15 changes: 8 additions & 7 deletions pkg/engine/common/base.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ func (s *Shared) Enqueue(queue *queue.Queue, navigationRequests ...*navigation.R
// if the user requested anyway out of scope items
// they are sent to output without visiting
if s.Options.Options.DisplayOutScope {
s.Output(nr, nil, ErrOutOfScope)
s.Output(nr, nil, nil, ErrOutOfScope)
}
continue
}
Expand All @@ -95,17 +95,18 @@ func (s *Shared) ValidateScope(URL string, root string) bool {
return err == nil && scopeValidated
}

func (s *Shared) Output(navigationRequest *navigation.Request, navigationResponse *navigation.Response, err error) {
func (s *Shared) Output(navigationRequest *navigation.Request, navigationResponse *navigation.Response, passiveReference *navigation.PassiveReference, err error) {
var errData string
if err != nil {
errData = err.Error()
}
// Write the found result to output
result := &output.Result{
Timestamp: time.Now(),
Request: navigationRequest,
Response: navigationResponse,
Error: errData,
Timestamp: time.Now(),
Request: navigationRequest,
Response: navigationResponse,
PassiveReference: passiveReference,
Error: errData,
}

outputErr := s.Options.OutputWriter.Write(result)
Expand Down Expand Up @@ -223,7 +224,7 @@ func (s *Shared) Do(crawlSession *CrawlSession, doRequest DoRequestFunc) error {

resp, err := doRequest(crawlSession, req)

s.Output(req, resp, err)
s.Output(req, resp, nil, err)

if err != nil {
gologger.Warning().Msgf("Could not request seed URL %s: %s\n", req.URL, err)
Expand Down
3 changes: 3 additions & 0 deletions pkg/engine/passive/doc.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
// Package passive implements the functionality for a passive crawler.
// It discovers target endpoints by querying passive sources such as
// waybackarchive, commoncrawl and alienvault.
package passive
124 changes: 124 additions & 0 deletions pkg/engine/passive/httpclient/httpclient.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
package httpclient

import (
"bytes"
"context"
"crypto/tls"
"fmt"
"io"
"net"
"net/http"
"net/url"
"time"

"github.com/projectdiscovery/gologger"
"github.com/projectdiscovery/useragent"
)

// HttpClient wraps a standard library *http.Client. Its methods build
// requests and send them through the wrapped client.
type HttpClient struct {
// Client is the underlying client used to perform every request.
Client *http.Client
}

// BasicAuth holds a username/password pair for HTTP basic authentication.
// HTTPRequest only attaches credentials when at least one field is non-empty,
// so the zero value means "no basic auth".
type BasicAuth struct {
// Username is the basic-auth user name.
Username string
// Password is the basic-auth password.
Password string
}

func NewHttpClient(timeout int) *HttpClient {
Transport := &http.Transport{
MaxIdleConns: 100,
MaxIdleConnsPerHost: 100,
TLSClientConfig: &tls.Config{
InsecureSkipVerify: true,
Dismissed Show dismissed Hide dismissed
},
Dial: (&net.Dialer{
Timeout: time.Duration(timeout) * time.Second,
}).Dial,
}

client := &http.Client{
Transport: Transport,
Timeout: time.Duration(timeout) * time.Second,
}

httpClient := &HttpClient{Client: client}

return httpClient
}

// Get sends a GET request to getURL with the given cookie string and extra
// headers. No request body and no basic-auth credentials are supplied.
func (hc *HttpClient) Get(ctx context.Context, getURL, cookies string, headers map[string]string) (*http.Response, error) {
	var (
		noBody io.Reader
		noAuth BasicAuth
	)
	return hc.HTTPRequest(ctx, http.MethodGet, getURL, cookies, headers, noBody, noAuth)
}

// SimpleGet sends a GET request to getURL without cookies, extra headers,
// a body or basic-auth credentials.
func (hc *HttpClient) SimpleGet(ctx context.Context, getURL string) (*http.Response, error) {
	const noCookies = ""
	emptyHeaders := make(map[string]string)
	return hc.HTTPRequest(ctx, http.MethodGet, getURL, noCookies, emptyHeaders, nil, BasicAuth{})
}

// Post sends a POST request to postURL carrying body, with the given cookie
// string and extra headers. No basic-auth credentials are supplied.
func (hc *HttpClient) Post(ctx context.Context, postURL, cookies string, headers map[string]string, body io.Reader) (*http.Response, error) {
	var noAuth BasicAuth
	return hc.HTTPRequest(ctx, http.MethodPost, postURL, cookies, headers, body, noAuth)
}

// SimplePost sends a POST request to postURL carrying body, with only a
// Content-Type header set to contentType. No cookies or basic-auth
// credentials are supplied.
func (hc *HttpClient) SimplePost(ctx context.Context, postURL, contentType string, body io.Reader) (*http.Response, error) {
	const noCookies = ""
	headers := make(map[string]string, 1)
	headers["Content-Type"] = contentType
	return hc.HTTPRequest(ctx, http.MethodPost, postURL, noCookies, headers, body, BasicAuth{})
}

// HTTPRequest builds a request for method/requestURL bound to ctx and sends
// it through the wrapped client.
//
// A randomly picked User-Agent plus fixed Accept, Accept-Language and
// Connection headers are set first. Basic-auth credentials are attached when
// either field of basicAuth is non-empty, and a Cookie header is added when
// cookies is non-empty. The entries of headers are applied last, so they
// override any header set earlier.
//
// It returns an error if the request cannot be constructed, otherwise
// whatever httpRequestWrapper returns.
func (hc *HttpClient) HTTPRequest(ctx context.Context, method, requestURL, cookies string, headers map[string]string, body io.Reader, basicAuth BasicAuth) (*http.Response, error) {
	request, err := http.NewRequestWithContext(ctx, method, requestURL, body)
	if err != nil {
		return nil, err
	}

	agent := useragent.PickRandom()
	baseline := [...][2]string{
		{"User-Agent", agent.String()},
		{"Accept", "*/*"},
		{"Accept-Language", "en"},
		{"Connection", "close"},
	}
	for _, kv := range baseline {
		request.Header.Set(kv[0], kv[1])
	}

	if hasCredentials := !(basicAuth.Username == "" && basicAuth.Password == ""); hasCredentials {
		request.SetBasicAuth(basicAuth.Username, basicAuth.Password)
	}

	if len(cookies) > 0 {
		request.Header.Set("Cookie", cookies)
	}

	// Caller-supplied headers go last so they win over everything set above.
	for name, val := range headers {
		request.Header.Set(name, val)
	}

	return httpRequestWrapper(hc.Client, request)
}

// DiscardHTTPResponse drains and closes the body of response. A nil response
// (or a response without a body) is ignored, so it is safe to call on the
// result of a failed request.
//
// The body is closed even when draining fails; a drain failure is logged as a
// warning.
func (hc *HttpClient) DiscardHTTPResponse(response *http.Response) {
	if response == nil || response.Body == nil {
		return
	}
	// Close unconditionally: returning early on a drain error without closing
	// would leak the body and its underlying connection.
	defer response.Body.Close()

	if _, err := io.Copy(io.Discard, response.Body); err != nil {
		gologger.Warning().Msgf("Could not discard response body: %s\n", err)
	}
}

// Close closes the idle connections held by the wrapped client.
func (hc *HttpClient) Close() {
	underlying := hc.Client
	underlying.CloseIdleConnections()
}

// httpRequestWrapper sends request with client and treats every status other
// than 200 OK as a failure.
//
// Returns:
//   - (nil, err) when the request itself fails;
//   - (response, nil) on 200 OK;
//   - (response, err) on any other status. The response is still returned in
//     this case, so the caller remains responsible for disposing of its body.
func httpRequestWrapper(client *http.Client, request *http.Request) (*http.Response, error) {
	response, err := client.Do(request)
	if err != nil {
		return nil, err
	}

	if response.StatusCode == http.StatusOK {
		return response, nil
	}

	// Prefer the unescaped URL for readability, but fall back to the raw form
	// when it contains an invalid escape sequence; otherwise the messages
	// below would carry an empty URL.
	rawURL := request.URL.String()
	requestURL, unescapeErr := url.QueryUnescape(rawURL)
	if unescapeErr != nil {
		requestURL = rawURL
	}

	gologger.Debug().MsgFunc(func() string {
		buffer := new(bytes.Buffer)
		// Best effort: a partial body is still useful in a debug message.
		_, _ = buffer.ReadFrom(response.Body)
		return fmt.Sprintf("Response for failed request against %s:\n%s", requestURL, buffer.String())
	})
	return response, fmt.Errorf("unexpected status code %d received from %s", response.StatusCode, requestURL)
}
Loading
Loading