diff --git a/cmd/walker/config-bestbytes.yaml b/cmd/walker/config-bestbytes.yaml
deleted file mode 100644
index 235055e..0000000
--- a/cmd/walker/config-bestbytes.yaml
+++ /dev/null
@@ -1,5 +0,0 @@
-target: http://www.bestbytes.de
-concurrency: 2
-addr: ":3001"
-ignore:
-  - /foomo
diff --git a/extractlinks.go b/filterscrapelinks.go
similarity index 91%
rename from extractlinks.go
rename to filterscrapelinks.go
index 1106a79..0e78c13 100644
--- a/extractlinks.go
+++ b/filterscrapelinks.go
@@ -7,6 +7,15 @@ import (
 	"github.com/temoto/robotstxt"
 )
 
+type linkLimitations struct {
+	depth               int
+	paging              bool
+	ignoreAllQueries    bool
+	ignorePathPrefixes  []string
+	includePathPrefixes []string
+	ignoreQueriesWith   []string
+}
+
 func normalizeLink(baseURL *url.URL, linkURL string) (normalizedLink *url.URL, err error) {
 	// let us ditch anchors
 	anchorParts := strings.Split(linkURL, "#")
@@ -32,7 +41,7 @@ func normalizeLink(baseURL *url.URL, linkURL string) (normalizedLink *url.URL, e
 	return
 }
 
-func extractLinksToScrape(
+func filterScrapeLinks(
 	linkList LinkList,
 	baseURL *url.URL,
 	linkNextNormalized string,
diff --git a/go.mod b/go.mod
index 2a60038..f5610e5 100644
--- a/go.mod
+++ b/go.mod
@@ -7,6 +7,7 @@ require (
 	github.com/davecgh/go-spew v1.1.1
 	github.com/prometheus/client_golang v1.2.1
 	github.com/stretchr/testify v1.3.0
+	github.com/temoto/robotstxt v1.1.1
 	gopkg.in/yaml.v1 v1.0.0-20140924161607-9f9df34309c0
 	gopkg.in/yaml.v2 v2.2.2
 )
diff --git a/go.sum b/go.sum
index 2958a74..0c0d8cd 100644
--- a/go.sum
+++ b/go.sum
@@ -66,6 +66,8 @@ github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+
 github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
 github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
 github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
+github.com/temoto/robotstxt v1.1.1 h1:Gh8RCs8ouX3hRSxxK7B1mO5RFByQ4CmJZDwgom++JaA=
+github.com/temoto/robotstxt v1.1.1/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo=
 golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
 golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
 golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
diff --git a/robotstxt.go b/robotstxt.go
new file mode 100644
index 0000000..f96ff7e
--- /dev/null
+++ b/robotstxt.go
@@ -0,0 +1,19 @@
+package walker
+
+import (
+	"net/http"
+
+	"github.com/temoto/robotstxt"
+)
+
+func getRobotsData(baseURL string) (data *robotstxt.RobotsData, err error) {
+	resp, errGet := http.Get(baseURL + "/robots.txt")
+	if errGet != nil {
+		return nil, errGet
+	}
+	data, errFromResponse := robotstxt.FromResponse(resp)
+	if errFromResponse != nil {
+		return nil, errFromResponse
+	}
+	return data, nil
+}
diff --git a/scrape.go b/scrape.go
index 6fb2f81..e680c2e 100644
--- a/scrape.go
+++ b/scrape.go
@@ -50,7 +50,7 @@ func Scrape(pc *poolClient, targetURL string, groupHeader string, chanResult cha
 		chanResult <- result
 		return
 	}
-	req.Header.Set("User-Agent", "foomo-walker")
+	req.Header.Set("User-Agent", pc.agent)
 
 	resp, errGet := pc.client.Do(req)
 	if errGet != nil {
diff --git a/scrapeloop.go b/scrapeloop.go
index d257e8a..b44c95e 100644
--- a/scrapeloop.go
+++ b/scrapeloop.go
@@ -1,6 +1,7 @@
 package walker
 
 import (
+	"errors"
 	"fmt"
 	"net"
 	"net/http"
@@ -10,85 +11,25 @@ import (
 	"strings"
 	"time"
 
-	"github.com/prometheus/client_golang/prometheus"
+	"github.com/temoto/robotstxt"
 )
 
 type poolClient struct {
+	agent  string
 	client *http.Client
 	busy   bool
 }
 
-func (w *Walker) scrapeloop() {
-	running := 0
-	depth := 0
-	paging := false
-	groupHeader := ""
-	ignoreAllQueries := false
-	ignoreRobots := false
-	var jobs map[string]bool
-	var results map[string]ScrapeResult
-	var ignore []string
-	var ignoreQueriesWith []string
-	var baseURL *url.URL
-	paths := []string{}
-	clientPool := []*poolClient{}
-	getBucketList()
+type clientPool struct {
+	agent       string
+	concurrency int
+	useCookies  bool
+	clients     []*poolClient
+}
+
-	const prometheusLabelGroup = "group"
-	const prometheusLabelStatus = "status"
-
-	summaryVec := prometheus.NewSummaryVec(
-		prometheus.SummaryOpts{
-			Name:       "walker_scrape_durations_seconds",
-			Help:       "scrape duration whole request time including streaming of body",
-			Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
-		},
-		[]string{prometheusLabelGroup},
-	)
-
-	counterVec := prometheus.NewCounterVec(
-		prometheus.CounterOpts{
-			Name: "walker_scrape_running_total",
-			Help: "Number of scrapes in scan.",
-		},
-		[]string{prometheusLabelGroup, prometheusLabelStatus},
-	)
-
-	totalCounter := prometheus.NewCounter(prometheus.CounterOpts{
-		Name: "walker_scrape_counter_total",
-		Help: "number of scrapes since start of walker",
-	})
-
-	progressGaugeOpen := prometheus.NewGauge(
-		prometheus.GaugeOpts{
-			Name: "walker_progress_gauge_open",
-			Help: "progress open to scrape",
-		},
-	)
-
-	progressGaugeComplete := prometheus.NewGauge(
-		prometheus.GaugeOpts{
-			Name: "walker_progress_gauge_complete",
-			Help: "progress complete scrapes",
-		},
-	)
-
-	counterVecStatus := prometheus.NewCounterVec(prometheus.CounterOpts{
-		Name: "walker_progress_status_code_total",
-		Help: "status codes for running scrape",
-	}, []string{prometheusLabelStatus})
-
-	prometheus.MustRegister(
-		summaryVec,
-		counterVec,
-		totalCounter,
-		progressGaugeComplete,
-		progressGaugeOpen,
-		counterVecStatus,
-	)
-
-	clientPool = make([]*poolClient, w.concurrency)
-	for i := 0; i < w.concurrency; i++ {
+func newClientPool(concurrency int, agent string, useCookies bool) *clientPool {
+	clients := make([]*poolClient, concurrency)
+	for i := 0; i < concurrency; i++ {
 		client := &http.Client{
 			Timeout: time.Second * 10,
 			Transport: &http.Transport{
@@ -98,17 +39,41 @@ func (w *Walker) scrapeloop() {
 				TLSHandshakeTimeout: 5 * time.Second,
 			},
 		}
-		if w.useCookies {
+		if useCookies {
 			cookieJar, _ := cookiejar.New(nil)
 			client.Jar = cookieJar
 		}
-		clientPool[i] = &poolClient{
+		clients[i] = &poolClient{
 			client: client,
 			busy:   false,
+			agent:  agent,
 		}
 	}
+	return &clientPool{
+		agent:       agent,
+		concurrency: concurrency,
+		clients:     clients,
+		useCookies:  useCookies,
+	}
+}
 
-	start := func(startURL *url.URL, configPaths []string) {
+func (w *Walker) scrapeloop() {
+	summaryVec, counterVec, totalCounter, progressGaugeOpen, progressGaugeComplete, counterVecStatus := setupMetrics()
+	running := 0
+	concurrency := 0
+	groupHeader := ""
+	ignoreRobots := false
+	started := false
+	ll := linkLimitations{}
+	var jobs map[string]bool
+	var results map[string]ScrapeResult
+	var baseURL *url.URL
+	paths := []string{}
+	var cp *clientPool
+	var robotsGroup *robotstxt.Group
+
+	restart := func(startURL *url.URL, configPaths []string) {
+		started = false
 		summaryVec.Reset()
 		counterVec.Reset()
 		counterVecStatus.Reset()
@@ -129,36 +94,36 @@ func (w *Walker) scrapeloop() {
 			jobs = map[string]bool{baseURLString + p + q: false}
 		}
 		results = map[string]ScrapeResult{}
-
+		started = true
 	}
-	for {
-		progressGaugeComplete.Set(float64(len(results)))
-		progressGaugeOpen.Set(float64(len(jobs)))
-		if len(jobs) > 0 {
-		JobLoop:
-			for jobURL, jobActive := range jobs {
-				if running >= w.concurrency {
-					// concurrency limit
-					break
-				}
-				if !jobActive {
-					for _, poolClient := range clientPool {
-						if !poolClient.busy {
-							running++
-							jobs[jobURL] = true
-							poolClient.busy = true
-							// u, _ := url.Parse(jobURL)
-							// fmt.Println("got pool client", i, poolClient.client.Jar.Cookies(u))
-							go Scrape(poolClient, jobURL, groupHeader, w.chanResult)
-							continue JobLoop
-						}
+	for {
+		if started {
+			progressGaugeComplete.Set(float64(len(results)))
+			progressGaugeOpen.Set(float64(len(jobs)))
+			if len(jobs) > 0 {
+			JobLoop:
+				for jobURL, jobActive := range jobs {
+					if running >= concurrency {
+						// concurrency limit
+						break
+					}
+					if !jobActive {
+						for _, poolClient := range cp.clients {
+							if !poolClient.busy {
+								running++
+								jobs[jobURL] = true
+								poolClient.busy = true
+								go Scrape(poolClient, jobURL, groupHeader, w.chanResult)
+								continue JobLoop
+							}
+						}
+						break JobLoop
+					}
-					// fmt.Println("all clients are busy")
-					break JobLoop
 				}
 			}
 		}
+
 		// time to restart
 		if results != nil && len(jobs) == 0 && running == 0 && baseURL != nil {
 			fmt.Println("restarting", baseURL, paths)
@@ -166,27 +131,57 @@ func (w *Walker) scrapeloop() {
 				Results: results,
 				Jobs:    jobs,
 			}
-			start(baseURL, paths)
+			restart(baseURL, paths)
 		}
 		select {
 		case <-time.After(time.Millisecond * 1000):
 			// make sure we do not get stuck
 		case st := <-w.chanStart:
+			robotsGroup = nil
 			groupHeader = st.conf.GroupHeader
-			ignore = st.conf.Ignore
-			depth = st.conf.Depth
-			paging = st.conf.Paging
+			concurrency = st.conf.Concurrency
+			ll.ignorePathPrefixes = st.conf.Ignore
+			ll.depth = st.conf.Depth
+			ll.paging = st.conf.Paging
+			ll.includePathPrefixes = st.conf.Target.Paths
 			ignoreRobots = st.conf.IgnoreRobots
-			ignoreQueriesWith = st.conf.IgnoreQueriesWith
-			ignoreAllQueries = st.conf.IgnoreAllQueries
+			ll.ignoreQueriesWith = st.conf.IgnoreQueriesWith
+			ll.ignoreAllQueries = st.conf.IgnoreAllQueries
+
+			if cp == nil || cp.agent != st.conf.Agent || cp.concurrency != st.conf.Concurrency || cp.useCookies != st.conf.UseCookies {
+				cp = newClientPool(st.conf.Concurrency, st.conf.Agent, st.conf.UseCookies)
+			}
+
+			var errStart error
 			startU, errParseStartU := url.Parse(st.conf.Target.BaseURL)
-			if errParseStartU == nil {
-				start(startU, st.conf.Target.Paths)
+			if errParseStartU != nil {
+				errStart = errParseStartU
+			}
+			if errStart == nil && !ignoreRobots {
+				robotsData, errRobotsGroup := getRobotsData(st.conf.Target.BaseURL)
+				if errRobotsGroup == nil {
+					robotsGroup = robotsData.FindGroup(st.conf.Agent)
+					robotForbiddenPath := []string{}
+					for _, p := range st.conf.Target.Paths {
+						if !robotsGroup.Test(p) {
+							robotForbiddenPath = append(robotForbiddenPath, p)
+						}
+					}
+					if len(robotForbiddenPath) > 0 {
+						errStart = errors.New("robots.txt does not allow access to the following path (you can either ignore robots or try as a different user agent): " + strings.Join(robotForbiddenPath, ", "))
+					}
+				} else {
+					errStart = errRobotsGroup
+				}
+			}
+			if errStart == nil {
+				restart(startU, st.conf.Target.Paths)
 				w.chanErrStart <- nil
 			} else {
-				w.chanErrStart <- errParseStartU
+				w.chanErrStart <- errStart
 			}
+
 		case <-w.chanStatus:
			resultsCopy := make(map[string]ScrapeResult, len(results))
 			jobsCopy := make(map[string]bool, len(jobs))
@@ -246,110 +241,31 @@ func (w *Walker) scrapeloop() {
 			totalCounter.Inc()
 			if ignoreRobots || !strings.Contains(scanResult.Structure.Robots, "nofollow") {
+				linkNextNormalized := ""
+				linkPrevNormalized := ""
 				// should we follow the links
-				for linkURL := range scanResult.Links {
+				linkNextNormalizedURL, errNormalizeNext := normalizeLink(baseURL, scanResult.Structure.LinkNext)
+				if errNormalizeNext == nil {
+					linkNextNormalized = linkNextNormalizedURL.String()
+				}
+				linkPrevNormalizedURL, errNormalizedPrev := normalizeLink(baseURL, scanResult.Structure.LinkPrev)
+				if errNormalizedPrev == nil {
+					linkPrevNormalized = linkPrevNormalizedURL.String()
+				}
 
-					// is it a pager link
-					// this might want to be normalized
-					isPagerLink := scanResult.Structure.LinkNext == linkURL || scanResult.Structure.LinkPrev == linkURL
-					if !paging && isPagerLink {
-						continue
-					}
-
-					// ok, time to really look at that url
-					linkU, errParseLinkU := normalizeLink(baseURL, linkURL)
-					if errParseLinkU == nil {
-
-						// to be ignored ?!
-						ignoreLink := false
-
-						if len(linkU.Query()) > 0 {
-							// it has a query
-							if ignoreAllQueries {
-								// no queries in general
-								ignoreLink = true
-							} else {
-								// do we filter a query parameter
-							IgnoreLoop:
-								for _, ignoreP := range ignoreQueriesWith {
-									for pName := range linkU.Query() {
-										if pName == ignoreP {
-											ignoreLink = true
-											break IgnoreLoop
-										}
-									}
-								}
-							}
-						}
-						if !ignoreLink {
-							foundPath := false
-							for _, p := range paths {
-								if strings.HasPrefix(linkU.Path, p) {
-									foundPath = true
-									break
-								}
-							}
-							if !foundPath {
-								// not in the scrape path
-								ignoreLink = true
-							}
-						}
-						if !ignoreLink && depth > 0 {
-							// too deep?
-							linkDepth := len(strings.Split(linkU.Path, "/")) - 1
-							ignoreLink = linkDepth > depth
-							if ignoreLink {
-								fmt.Println("ignoring", linkU.Path, depth, linkDepth)
-							}
-						}
-						// ignore prefix
-						if !ignoreLink {
-							for _, ignorePrefix := range ignore {
-								if strings.HasPrefix(linkU.Path, ignorePrefix) {
-									ignoreLink = true
-									break
-								}
-							}
-						}
-
-						if !ignoreLink && linkU.Host == baseURL.Host &&
-							linkU.Scheme == baseURL.Scheme {
-							scanResult.Links[linkU.String()] = scanResult.Links[linkURL]
-							linkURL = linkU.String()
-							_, existingResultOK := results[linkURL]
-							_, existingJobOK := jobs[linkURL]
-							if !existingResultOK && !existingJobOK {
-								jobs[linkURL] = false
-							}
-						}
+				linksToScrape := filterScrapeLinks(scanResult.Links, baseURL, linkNextNormalized, linkPrevNormalized, ll, robotsGroup)
+				for linkToScrape := range linksToScrape {
+					// scanResult.Links[linkU.String()] = scanResult.Links[linkURL]
+					// linkURL = linkU.String()
+					_, existingResultOK := results[linkToScrape]
+					_, existingJobOK := jobs[linkToScrape]
+					if !existingResultOK && !existingJobOK {
+						jobs[linkToScrape] = false
 					}
 				}
+			}
 		}
 	}
 }
-
-func normalizeLink(baseURL *url.URL, linkURL string) (normalizedLink *url.URL, err error) {
-	// let us ditch anchors
-	anchorParts := strings.Split(linkURL, "#")
-	linkURL = anchorParts[0]
-	link, errParseLink := url.Parse(linkURL)
-	if errParseLink != nil {
-		err = errParseLink
-		return
-	}
-	// host
-	if link.Host == "" {
-		link.Host = baseURL.Host
-	}
-	// scheme
-	if link.Scheme == "" || link.Scheme == "//" {
-		link.Scheme = baseURL.Scheme
-	}
-	if baseURL.User != nil {
-		link.User = baseURL.User
-	}
-	// it is beautiful now
-	normalizedLink = link
-	return
-}
diff --git a/service.go b/service.go
index 10ed19c..450522f 100644
--- a/service.go
+++ b/service.go
@@ -15,8 +15,11 @@ type Service struct {
 }
 
 func NewService(conf *config.Config) (s *Service, err error) {
-	w := NewWalker(conf.Concurrency, conf.UseCookies)
-	w.walk(conf)
+	w := NewWalker()
+	errWalk := w.walk(conf)
+	if errWalk != nil {
+		return nil, errWalk
+	}
 	s = &Service{
 		Walker: w,
 		// targetURL: conf.Target,
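
Note: the robots.txt gate added to scrapeloop.go boils down to the pattern sketched below. This is a minimal standalone sketch for context, not code from the diff; the base URL, agent, and paths are made-up example values, and error handling is reduced to panics for brevity. It fetches robots.txt the same way getRobotsData does, resolves the group for the configured agent with FindGroup, and rejects any configured path the group disallows via Test.

package main

import (
	"fmt"
	"net/http"
	"strings"

	"github.com/temoto/robotstxt"
)

func main() {
	// example values only, not taken from a real walker config
	baseURL := "https://example.com"
	agent := "foomo-walker"
	paths := []string{"/", "/products"}

	// fetch and parse robots.txt, mirroring getRobotsData in robotstxt.go
	resp, err := http.Get(baseURL + "/robots.txt")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	data, err := robotstxt.FromResponse(resp)
	if err != nil {
		panic(err)
	}

	// mirror the start-up check in scrapeloop.go: collect every configured
	// path that the agent's robots group is not allowed to crawl
	group := data.FindGroup(agent)
	forbidden := []string{}
	for _, p := range paths {
		if !group.Test(p) {
			forbidden = append(forbidden, p)
		}
	}
	if len(forbidden) > 0 {
		fmt.Println("robots.txt forbids:", strings.Join(forbidden, ", "))
		return
	}
	fmt.Println("all configured paths are allowed for", agent)
}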