initial robots.txt support

Jan Halfar 2020-03-11 14:36:55 +01:00
parent df5259b447
commit 91dc1fca07
8 changed files with 160 additions and 215 deletions

(deleted file, name not shown)

@@ -1,5 +0,0 @@
-target: http://www.bestbytes.de
-concurrency: 2
-addr: ":3001"
-ignore:
-- /foomo

(changed file, name not shown)

@@ -7,6 +7,15 @@ import (
 	"github.com/temoto/robotstxt"
 )
 
+type linkLimitations struct {
+	depth               int
+	paging              bool
+	ignoreAllQueries    bool
+	ignorePathPrefixes  []string
+	includePathPrefixes []string
+	ignoreQueriesWith   []string
+}
+
 func normalizeLink(baseURL *url.URL, linkURL string) (normalizedLink *url.URL, err error) {
 	// let us ditch anchors
 	anchorParts := strings.Split(linkURL, "#")
@@ -32,7 +41,7 @@ func normalizeLink(baseURL *url.URL, linkURL string) (normalizedLink *url.URL, e
 	return
 }
 
-func extractLinksToScrape(
+func filterScrapeLinks(
 	linkList LinkList,
 	baseURL *url.URL,
 	linkNextNormalized string,

go.mod

@@ -7,6 +7,7 @@ require (
 	github.com/davecgh/go-spew v1.1.1
 	github.com/prometheus/client_golang v1.2.1
 	github.com/stretchr/testify v1.3.0
+	github.com/temoto/robotstxt v1.1.1
 	gopkg.in/yaml.v1 v1.0.0-20140924161607-9f9df34309c0
 	gopkg.in/yaml.v2 v2.2.2
 )

go.sum

@@ -66,6 +66,8 @@ github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+
 github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
 github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
 github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
+github.com/temoto/robotstxt v1.1.1 h1:Gh8RCs8ouX3hRSxxK7B1mO5RFByQ4CmJZDwgom++JaA=
+github.com/temoto/robotstxt v1.1.1/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo=
 golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
 golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
 golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=

robotstxt.go (new file)

@@ -0,0 +1,19 @@
+package walker
+
+import (
+	"net/http"
+
+	"github.com/temoto/robotstxt"
+)
+
+func getRobotsData(baseURL string) (data *robotstxt.RobotsData, err error) {
+	resp, errGet := http.Get(baseURL + "/robots.txt")
+	if errGet != nil {
+		return nil, errGet
+	}
+	data, errFromResponse := robotstxt.FromResponse(resp)
+	if errFromResponse != nil {
+		return nil, errFromResponse
+	}
+	return data, nil
+}
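
For reference, the walker loop further down consumes this helper via robotstxt's FindGroup/Test API. A self-contained sketch of the same fetch-and-parse flow against a throwaway httptest server (the robots.txt body, agent string and paths are made-up examples, not part of this commit):

package main

import (
	"fmt"
	"net/http"
	"net/http/httptest"

	"github.com/temoto/robotstxt"
)

func main() {
	// throwaway server that only answers /robots.txt
	ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path == "/robots.txt" {
			fmt.Fprint(w, "User-agent: *\nDisallow: /private\n")
			return
		}
		http.NotFound(w, r)
	}))
	defer ts.Close()

	// same fetch-and-parse flow as getRobotsData above
	resp, err := http.Get(ts.URL + "/robots.txt")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	data, err := robotstxt.FromResponse(resp)
	if err != nil {
		panic(err)
	}

	group := data.FindGroup("foomo-walker") // falls back to the "*" group
	fmt.Println(group.Test("/foomo"))       // true: allowed
	fmt.Println(group.Test("/private/x"))   // false: disallowed
}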

(changed file, name not shown)

@@ -50,7 +50,7 @@ func Scrape(pc *poolClient, targetURL string, groupHeader string, chanResult cha
 		chanResult <- result
 		return
 	}
-	req.Header.Set("User-Agent", "foomo-walker")
+	req.Header.Set("User-Agent", pc.agent)
 	resp, errGet := pc.client.Do(req)
 	if errGet != nil {

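Aside: the User-Agent header now comes from the pool client instead of the hard-coded "foomo-walker", and the walker loop below looks up the robots.txt group with the same configured agent. A small illustrative sketch of why those two values need to agree (the robots.txt body and URL here are invented for the example):

package main

import (
	"fmt"
	"net/http"

	"github.com/temoto/robotstxt"
)

func main() {
	agent := "foomo-walker" // whatever agent the walker is configured with

	// the header the target site sees on every scrape
	req, err := http.NewRequest(http.MethodGet, "https://example.com/foo", nil)
	if err != nil {
		panic(err)
	}
	req.Header.Set("User-Agent", agent)

	// the group the walker tests paths against must be resolved with the
	// same agent string, otherwise allow/deny decisions drift apart
	data, err := robotstxt.FromString("User-agent: foomo-walker\nDisallow: /foo\n")
	if err != nil {
		panic(err)
	}
	fmt.Println(data.FindGroup(agent).Test(req.URL.Path)) // false: /foo is disallowed
}
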
(changed file, name not shown)

@@ -1,6 +1,7 @@
 package walker
 
 import (
+	"errors"
 	"fmt"
 	"net"
 	"net/http"
@@ -10,85 +11,25 @@ import (
 	"strings"
 	"time"
 
-	"github.com/prometheus/client_golang/prometheus"
+	"github.com/temoto/robotstxt"
 )
 
 type poolClient struct {
+	agent  string
 	client *http.Client
 	busy   bool
 }
 
-func (w *Walker) scrapeloop() {
-	running := 0
-	depth := 0
-	paging := false
-	groupHeader := ""
-	ignoreAllQueries := false
-	ignoreRobots := false
-	var jobs map[string]bool
-	var results map[string]ScrapeResult
-	var ignore []string
-	var ignoreQueriesWith []string
-	var baseURL *url.URL
-	paths := []string{}
-	clientPool := []*poolClient{}
-	getBucketList()
-
-	const prometheusLabelGroup = "group"
-	const prometheusLabelStatus = "status"
-
-	summaryVec := prometheus.NewSummaryVec(
-		prometheus.SummaryOpts{
-			Name:       "walker_scrape_durations_seconds",
-			Help:       "scrape duration whole request time including streaming of body",
-			Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
-		},
-		[]string{prometheusLabelGroup},
-	)
-	counterVec := prometheus.NewCounterVec(
-		prometheus.CounterOpts{
-			Name: "walker_scrape_running_total",
-			Help: "Number of scrapes in scan.",
-		},
-		[]string{prometheusLabelGroup, prometheusLabelStatus},
-	)
-	totalCounter := prometheus.NewCounter(prometheus.CounterOpts{
-		Name: "walker_scrape_counter_total",
-		Help: "number of scrapes since start of walker",
-	})
-	progressGaugeOpen := prometheus.NewGauge(
-		prometheus.GaugeOpts{
-			Name: "walker_progress_gauge_open",
-			Help: "progress open to scrape",
-		},
-	)
-	progressGaugeComplete := prometheus.NewGauge(
-		prometheus.GaugeOpts{
-			Name: "walker_progress_gauge_complete",
-			Help: "progress complete scrapes",
-		},
-	)
-	counterVecStatus := prometheus.NewCounterVec(prometheus.CounterOpts{
-		Name: "walker_progress_status_code_total",
-		Help: "status codes for running scrape",
-	}, []string{prometheusLabelStatus})
-	prometheus.MustRegister(
-		summaryVec,
-		counterVec,
-		totalCounter,
-		progressGaugeComplete,
-		progressGaugeOpen,
-		counterVecStatus,
-	)
-
-	clientPool = make([]*poolClient, w.concurrency)
-	for i := 0; i < w.concurrency; i++ {
+type clientPool struct {
+	agent       string
+	concurrency int
+	useCookies  bool
+	clients     []*poolClient
+}
+
+func newClientPool(concurrency int, agent string, useCookies bool) *clientPool {
+	clients := make([]*poolClient, concurrency)
+	for i := 0; i < concurrency; i++ {
 		client := &http.Client{
 			Timeout: time.Second * 10,
 			Transport: &http.Transport{
@@ -98,17 +39,41 @@ func (w *Walker) scrapeloop() {
 				TLSHandshakeTimeout: 5 * time.Second,
 			},
 		}
-		if w.useCookies {
+		if useCookies {
 			cookieJar, _ := cookiejar.New(nil)
 			client.Jar = cookieJar
 		}
-		clientPool[i] = &poolClient{
+		clients[i] = &poolClient{
 			client: client,
 			busy:   false,
+			agent:  agent,
 		}
 	}
+	return &clientPool{
+		agent:       agent,
+		concurrency: concurrency,
+		clients:     clients,
+		useCookies:  useCookies,
+	}
+}
 
-	start := func(startURL *url.URL, configPaths []string) {
+func (w *Walker) scrapeloop() {
+	summaryVec, counterVec, totalCounter, progressGaugeOpen, progressGaugeComplete, counterVecStatus := setupMetrics()
+	running := 0
+	concurrency := 0
+	groupHeader := ""
+	ignoreRobots := false
+	started := false
+	ll := linkLimitations{}
+	var jobs map[string]bool
+	var results map[string]ScrapeResult
+	var baseURL *url.URL
+	paths := []string{}
+	var cp *clientPool
+	var robotsGroup *robotstxt.Group
+
+	restart := func(startURL *url.URL, configPaths []string) {
+		started = false
 		summaryVec.Reset()
 		counterVec.Reset()
 		counterVecStatus.Reset()
@@ -129,36 +94,36 @@ func (w *Walker) scrapeloop() {
 			jobs = map[string]bool{baseURLString + p + q: false}
 		}
 		results = map[string]ScrapeResult{}
+		started = true
 	}
 
-	for {
-		progressGaugeComplete.Set(float64(len(results)))
-		progressGaugeOpen.Set(float64(len(jobs)))
-		if len(jobs) > 0 {
-		JobLoop:
-			for jobURL, jobActive := range jobs {
-				if running >= w.concurrency {
-					// concurrency limit
-					break
-				}
-				if !jobActive {
-					for _, poolClient := range clientPool {
-						if !poolClient.busy {
-							running++
-							jobs[jobURL] = true
-							poolClient.busy = true
-							// u, _ := url.Parse(jobURL)
-							// fmt.Println("got pool client", i, poolClient.client.Jar.Cookies(u))
-							go Scrape(poolClient, jobURL, groupHeader, w.chanResult)
-							continue JobLoop
-						}
-					}
-					// fmt.Println("all clients are busy")
-					break JobLoop
-				}
-			}
-		}
+	for {
+		if started {
+			progressGaugeComplete.Set(float64(len(results)))
+			progressGaugeOpen.Set(float64(len(jobs)))
+			if len(jobs) > 0 {
+			JobLoop:
+				for jobURL, jobActive := range jobs {
+					if running >= concurrency {
+						// concurrency limit
+						break
+					}
+					if !jobActive {
+						for _, poolClient := range cp.clients {
+							if !poolClient.busy {
+								running++
+								jobs[jobURL] = true
+								poolClient.busy = true
+								go Scrape(poolClient, jobURL, groupHeader, w.chanResult)
+								continue JobLoop
+							}
+						}
+						break JobLoop
+					}
+				}
+			}
+		}
 		// time to restart
 		if results != nil && len(jobs) == 0 && running == 0 && baseURL != nil {
 			fmt.Println("restarting", baseURL, paths)
@@ -166,27 +131,57 @@ func (w *Walker) scrapeloop() {
 				Results: results,
 				Jobs:    jobs,
 			}
-			start(baseURL, paths)
+			restart(baseURL, paths)
 		}
 		select {
 		case <-time.After(time.Millisecond * 1000):
 			// make sure we do not get stuck
 		case st := <-w.chanStart:
+			robotsGroup = nil
 			groupHeader = st.conf.GroupHeader
-			ignore = st.conf.Ignore
-			depth = st.conf.Depth
-			paging = st.conf.Paging
+			concurrency = st.conf.Concurrency
+			ll.ignorePathPrefixes = st.conf.Ignore
+			ll.depth = st.conf.Depth
+			ll.paging = st.conf.Paging
+			ll.includePathPrefixes = st.conf.Target.Paths
 			ignoreRobots = st.conf.IgnoreRobots
-			ignoreQueriesWith = st.conf.IgnoreQueriesWith
-			ignoreAllQueries = st.conf.IgnoreAllQueries
+			ll.ignoreQueriesWith = st.conf.IgnoreQueriesWith
+			ll.ignoreAllQueries = st.conf.IgnoreAllQueries
+			if cp == nil || cp.agent != st.conf.Agent || cp.concurrency != st.conf.Concurrency || cp.useCookies != st.conf.UseCookies {
+				cp = newClientPool(st.conf.Concurrency, st.conf.Agent, st.conf.UseCookies)
+			}
+			var errStart error
 			startU, errParseStartU := url.Parse(st.conf.Target.BaseURL)
-			if errParseStartU == nil {
-				start(startU, st.conf.Target.Paths)
+			if errParseStartU != nil {
+				errStart = errParseStartU
+			}
+			if errStart == nil && !ignoreRobots {
+				robotsData, errRobotsGroup := getRobotsData(st.conf.Target.BaseURL)
+				if errRobotsGroup == nil {
+					robotsGroup = robotsData.FindGroup(st.conf.Agent)
+					robotForbiddenPath := []string{}
+					for _, p := range st.conf.Target.Paths {
+						if !robotsGroup.Test(p) {
+							robotForbiddenPath = append(robotForbiddenPath, p)
+						}
+					}
+					if len(robotForbiddenPath) > 0 {
+						errStart = errors.New("robots.txt does not allow access to the following path (you can either ignore robots or try as a different user agent): " + strings.Join(robotForbiddenPath, ", "))
+					}
+				} else {
+					errStart = errRobotsGroup
+				}
+			}
+			if errStart == nil {
+				restart(startU, st.conf.Target.Paths)
 				w.chanErrStart <- nil
 			} else {
-				w.chanErrStart <- errParseStartU
+				w.chanErrStart <- errStart
 			}
 		case <-w.chanStatus:
 			resultsCopy := make(map[string]ScrapeResult, len(results))
 			jobsCopy := make(map[string]bool, len(jobs))
@@ -246,110 +241,31 @@ func (w *Walker) scrapeloop() {
 				totalCounter.Inc()
 				if ignoreRobots || !strings.Contains(scanResult.Structure.Robots, "nofollow") {
+					linkNextNormalized := ""
+					linkPrevNormalized := ""
 					// should we follow the links
-					for linkURL := range scanResult.Links {
-						// is it a pager link
-						// this might want to be normalized
-						isPagerLink := scanResult.Structure.LinkNext == linkURL || scanResult.Structure.LinkPrev == linkURL
-						if !paging && isPagerLink {
-							continue
-						}
-						// ok, time to really look at that url
-						linkU, errParseLinkU := normalizeLink(baseURL, linkURL)
-						if errParseLinkU == nil {
-							// to be ignored ?!
-							ignoreLink := false
-							if len(linkU.Query()) > 0 {
-								// it has a query
-								if ignoreAllQueries {
-									// no queries in general
-									ignoreLink = true
-								} else {
-									// do we filter a query parameter
-								IgnoreLoop:
-									for _, ignoreP := range ignoreQueriesWith {
-										for pName := range linkU.Query() {
-											if pName == ignoreP {
-												ignoreLink = true
-												break IgnoreLoop
-											}
-										}
-									}
-								}
-							}
-							if !ignoreLink {
-								foundPath := false
-								for _, p := range paths {
-									if strings.HasPrefix(linkU.Path, p) {
-										foundPath = true
-										break
-									}
-								}
-								if !foundPath {
-									// not in the scrape path
-									ignoreLink = true
-								}
-							}
-							if !ignoreLink && depth > 0 {
-								// too deep?
-								linkDepth := len(strings.Split(linkU.Path, "/")) - 1
-								ignoreLink = linkDepth > depth
-								if ignoreLink {
-									fmt.Println("ignoring", linkU.Path, depth, linkDepth)
-								}
-							}
-							// ignore prefix
-							if !ignoreLink {
-								for _, ignorePrefix := range ignore {
-									if strings.HasPrefix(linkU.Path, ignorePrefix) {
-										ignoreLink = true
-										break
-									}
-								}
-							}
-							if !ignoreLink && linkU.Host == baseURL.Host &&
-								linkU.Scheme == baseURL.Scheme {
-								scanResult.Links[linkU.String()] = scanResult.Links[linkURL]
-								linkURL = linkU.String()
-								_, existingResultOK := results[linkURL]
-								_, existingJobOK := jobs[linkURL]
-								if !existingResultOK && !existingJobOK {
-									jobs[linkURL] = false
-								}
-							}
-						}
+					linkNextNormalizedURL, errNormalizeNext := normalizeLink(baseURL, scanResult.Structure.LinkNext)
+					if errNormalizeNext == nil {
+						linkNextNormalized = linkNextNormalizedURL.String()
+					}
+					linkPrevNormalizedURL, errNormalizedPrev := normalizeLink(baseURL, scanResult.Structure.LinkPrev)
+					if errNormalizedPrev == nil {
+						linkPrevNormalized = linkPrevNormalizedURL.String()
+					}
+					linksToScrape := filterScrapeLinks(scanResult.Links, baseURL, linkNextNormalized, linkPrevNormalized, ll, robotsGroup)
+					for linkToScrape := range linksToScrape {
+						// scanResult.Links[linkU.String()] = scanResult.Links[linkURL]
+						// linkURL = linkU.String()
+						_, existingResultOK := results[linkToScrape]
+						_, existingJobOK := jobs[linkToScrape]
+						if !existingResultOK && !existingJobOK {
+							jobs[linkToScrape] = false
+						}
 					}
 				}
 			}
 		}
 	}
 }
-
-func normalizeLink(baseURL *url.URL, linkURL string) (normalizedLink *url.URL, err error) {
-	// let us ditch anchors
-	anchorParts := strings.Split(linkURL, "#")
-	linkURL = anchorParts[0]
-	link, errParseLink := url.Parse(linkURL)
-	if errParseLink != nil {
-		err = errParseLink
-		return
-	}
-	// host
-	if link.Host == "" {
-		link.Host = baseURL.Host
-	}
-	// scheme
-	if link.Scheme == "" || link.Scheme == "//" {
-		link.Scheme = baseURL.Scheme
-	}
-	if baseURL.User != nil {
-		link.User = baseURL.User
-	}
-	// it is beautiful now
-	normalizedLink = link
-	return
-}
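
The new filterScrapeLinks body itself is not part of this diff (only its signature appears in the first file above), but from the call site and the inline logic removed here it presumably combines the old pager, query, path-prefix and depth checks with the new robots.txt group test. A rough sketch under that assumption; LinkList is stubbed as a URL-keyed map and the body is illustrative, not the committed implementation:

package sketch

import (
	"net/url"
	"strings"

	"github.com/temoto/robotstxt"
)

// Stand-in for the walker package's LinkList; here it only matters that it
// is keyed by the raw link URL.
type LinkList map[string]int

type linkLimitations struct {
	depth               int
	paging              bool
	ignoreAllQueries    bool
	ignorePathPrefixes  []string
	includePathPrefixes []string
	ignoreQueriesWith   []string
}

// filterScrapeLinks (illustrative body): keep only same-host, same-scheme
// links that survive the pager, query, path-prefix, depth and robots.txt
// checks that used to be inlined in scrapeloop.
func filterScrapeLinks(
	linkList LinkList,
	baseURL *url.URL,
	linkNextNormalized string,
	linkPrevNormalized string,
	ll linkLimitations,
	robotsGroup *robotstxt.Group,
) map[string]bool {
	keep := map[string]bool{}
LinkLoop:
	for linkURL := range linkList {
		// ditch anchors and resolve relative links against the base URL
		linkU, errParse := url.Parse(strings.Split(linkURL, "#")[0])
		if errParse != nil {
			continue
		}
		linkU = baseURL.ResolveReference(linkU)
		if linkU.Host != baseURL.Host || linkU.Scheme != baseURL.Scheme {
			continue
		}
		normalized := linkU.String()
		// pager links are only followed when paging is enabled
		if !ll.paging && (normalized == linkNextNormalized || normalized == linkPrevNormalized) {
			continue
		}
		// query handling
		if len(linkU.Query()) > 0 {
			if ll.ignoreAllQueries {
				continue
			}
			for _, p := range ll.ignoreQueriesWith {
				if _, ok := linkU.Query()[p]; ok {
					continue LinkLoop
				}
			}
		}
		// must live under one of the configured target paths ...
		included := false
		for _, p := range ll.includePathPrefixes {
			if strings.HasPrefix(linkU.Path, p) {
				included = true
				break
			}
		}
		if !included {
			continue
		}
		// ... and not under an ignored prefix
		for _, p := range ll.ignorePathPrefixes {
			if strings.HasPrefix(linkU.Path, p) {
				continue LinkLoop
			}
		}
		// depth limit, counted in path segments as before
		if ll.depth > 0 && len(strings.Split(linkU.Path, "/"))-1 > ll.depth {
			continue
		}
		// finally, the new robots.txt check (robotsGroup stays nil when robots are ignored)
		if robotsGroup != nil && !robotsGroup.Test(linkU.Path) {
			continue
		}
		keep[normalized] = true
	}
	return keep
}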

(changed file, name not shown)

@@ -15,8 +15,11 @@ type Service struct {
 }
 
 func NewService(conf *config.Config) (s *Service, err error) {
-	w := NewWalker(conf.Concurrency, conf.UseCookies)
-	w.walk(conf)
+	w := NewWalker()
+	errWalk := w.walk(conf)
+	if errWalk != nil {
+		return nil, errWalk
+	}
 	s = &Service{
 		Walker: w,
 		// targetURL: conf.Target,