initial robots.txt support

Jan Halfar 2020-03-11 14:36:55 +01:00
parent df5259b447
commit 91dc1fca07
8 changed files with 160 additions and 215 deletions

(deleted file, name not shown)

@@ -1,5 +0,0 @@
-target: http://www.bestbytes.de
-concurrency: 2
-addr: ":3001"
-ignore:
-- /foomo

(changed file, name not shown)

@@ -7,6 +7,15 @@ import (
 	"github.com/temoto/robotstxt"
 )
 
+type linkLimitations struct {
+	depth               int
+	paging              bool
+	ignoreAllQueries    bool
+	ignorePathPrefixes  []string
+	includePathPrefixes []string
+	ignoreQueriesWith   []string
+}
+
 func normalizeLink(baseURL *url.URL, linkURL string) (normalizedLink *url.URL, err error) {
 	// let us ditch anchors
 	anchorParts := strings.Split(linkURL, "#")
@@ -32,7 +41,7 @@ func normalizeLink(baseURL *url.URL, linkURL string) (normalizedLink *url.URL, e
 	return
 }
 
-func extractLinksToScrape(
+func filterScrapeLinks(
 	linkList LinkList,
 	baseURL *url.URL,
 	linkNextNormalized string,

go.mod

@@ -7,6 +7,7 @@ require (
 	github.com/davecgh/go-spew v1.1.1
 	github.com/prometheus/client_golang v1.2.1
 	github.com/stretchr/testify v1.3.0
+	github.com/temoto/robotstxt v1.1.1
 	gopkg.in/yaml.v1 v1.0.0-20140924161607-9f9df34309c0
 	gopkg.in/yaml.v2 v2.2.2
 )

go.sum

@@ -66,6 +66,8 @@ github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+
 github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
 github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
 github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
+github.com/temoto/robotstxt v1.1.1 h1:Gh8RCs8ouX3hRSxxK7B1mO5RFByQ4CmJZDwgom++JaA=
+github.com/temoto/robotstxt v1.1.1/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo=
 golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
 golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
 golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=

robotstxt.go (new file)

@@ -0,0 +1,19 @@
+package walker
+
+import (
+	"net/http"
+
+	"github.com/temoto/robotstxt"
+)
+
+func getRobotsData(baseURL string) (data *robotstxt.RobotsData, err error) {
+	resp, errGet := http.Get(baseURL + "/robots.txt")
+	if errGet != nil {
+		return nil, errGet
+	}
+	data, errFromResponse := robotstxt.FromResponse(resp)
+	if errFromResponse != nil {
+		return nil, errFromResponse
+	}
+	return data, nil
+}
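
For reference, the walker loop further down consumes this helper via robotstxt's FindGroup/Test API. A self-contained sketch of the same fetch-and-parse flow against a throwaway httptest server (the robots.txt body, agent string and paths are made-up examples, not part of this commit):

package main

import (
	"fmt"
	"net/http"
	"net/http/httptest"

	"github.com/temoto/robotstxt"
)

func main() {
	// throwaway server that only answers /robots.txt
	ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path == "/robots.txt" {
			fmt.Fprint(w, "User-agent: *\nDisallow: /private\n")
			return
		}
		http.NotFound(w, r)
	}))
	defer ts.Close()

	// same fetch-and-parse flow as getRobotsData above
	resp, err := http.Get(ts.URL + "/robots.txt")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	data, err := robotstxt.FromResponse(resp)
	if err != nil {
		panic(err)
	}

	group := data.FindGroup("foomo-walker") // falls back to the "*" group
	fmt.Println(group.Test("/foomo"))       // true: allowed
	fmt.Println(group.Test("/private/x"))   // false: disallowed
}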

(changed file, name not shown)

@@ -50,7 +50,7 @@ func Scrape(pc *poolClient, targetURL string, groupHeader string, chanResult cha
 		chanResult <- result
 		return
 	}
-	req.Header.Set("User-Agent", "foomo-walker")
+	req.Header.Set("User-Agent", pc.agent)
 	resp, errGet := pc.client.Do(req)
 	if errGet != nil {

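Aside: the User-Agent header now comes from the pool client instead of the hard-coded "foomo-walker", and the walker loop below looks up the robots.txt group with the same configured agent. A small illustrative sketch of why those two values need to agree (the robots.txt body and URL here are invented for the example):

package main

import (
	"fmt"
	"net/http"

	"github.com/temoto/robotstxt"
)

func main() {
	agent := "foomo-walker" // whatever agent the walker is configured with

	// the header the target site sees on every scrape
	req, err := http.NewRequest(http.MethodGet, "https://example.com/foo", nil)
	if err != nil {
		panic(err)
	}
	req.Header.Set("User-Agent", agent)

	// the group the walker tests paths against must be resolved with the
	// same agent string, otherwise allow/deny decisions drift apart
	data, err := robotstxt.FromString("User-agent: foomo-walker\nDisallow: /foo\n")
	if err != nil {
		panic(err)
	}
	fmt.Println(data.FindGroup(agent).Test(req.URL.Path)) // false: /foo is disallowed
}
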
(changed file, name not shown)

@@ -1,6 +1,7 @@
 package walker
 
 import (
+	"errors"
 	"fmt"
 	"net"
 	"net/http"
@@ -10,85 +11,25 @@ import (
 	"strings"
 	"time"
 
-	"github.com/prometheus/client_golang/prometheus"
+	"github.com/temoto/robotstxt"
 )
 
 type poolClient struct {
+	agent  string
 	client *http.Client
 	busy   bool
 }
 
-func (w *Walker) scrapeloop() {
-	running := 0
-	depth := 0
-	paging := false
-	groupHeader := ""
-	ignoreAllQueries := false
-	ignoreRobots := false
-	var jobs map[string]bool
-	var results map[string]ScrapeResult
-	var ignore []string
-	var ignoreQueriesWith []string
-	var baseURL *url.URL
-	paths := []string{}
-	clientPool := []*poolClient{}
-	getBucketList()
-
-	const prometheusLabelGroup = "group"
-	const prometheusLabelStatus = "status"
-
-	summaryVec := prometheus.NewSummaryVec(
-		prometheus.SummaryOpts{
-			Name:       "walker_scrape_durations_seconds",
-			Help:       "scrape duration whole request time including streaming of body",
-			Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
-		},
-		[]string{prometheusLabelGroup},
-	)
-	counterVec := prometheus.NewCounterVec(
-		prometheus.CounterOpts{
-			Name: "walker_scrape_running_total",
-			Help: "Number of scrapes in scan.",
-		},
-		[]string{prometheusLabelGroup, prometheusLabelStatus},
-	)
-	totalCounter := prometheus.NewCounter(prometheus.CounterOpts{
-		Name: "walker_scrape_counter_total",
-		Help: "number of scrapes since start of walker",
-	})
-	progressGaugeOpen := prometheus.NewGauge(
-		prometheus.GaugeOpts{
-			Name: "walker_progress_gauge_open",
-			Help: "progress open to scrape",
-		},
-	)
-	progressGaugeComplete := prometheus.NewGauge(
-		prometheus.GaugeOpts{
-			Name: "walker_progress_gauge_complete",
-			Help: "progress complete scrapes",
-		},
-	)
-	counterVecStatus := prometheus.NewCounterVec(prometheus.CounterOpts{
-		Name: "walker_progress_status_code_total",
-		Help: "status codes for running scrape",
-	}, []string{prometheusLabelStatus})
-	prometheus.MustRegister(
-		summaryVec,
-		counterVec,
-		totalCounter,
-		progressGaugeComplete,
-		progressGaugeOpen,
-		counterVecStatus,
-	)
-
-	clientPool = make([]*poolClient, w.concurrency)
-	for i := 0; i < w.concurrency; i++ {
+type clientPool struct {
+	agent       string
+	concurrency int
+	useCookies  bool
+	clients     []*poolClient
+}
+
+func newClientPool(concurrency int, agent string, useCookies bool) *clientPool {
+	clients := make([]*poolClient, concurrency)
+	for i := 0; i < concurrency; i++ {
 		client := &http.Client{
 			Timeout: time.Second * 10,
 			Transport: &http.Transport{
@@ -98,17 +39,41 @@ func (w *Walker) scrapeloop() {
 				TLSHandshakeTimeout: 5 * time.Second,
 			},
 		}
-		if w.useCookies {
+		if useCookies {
 			cookieJar, _ := cookiejar.New(nil)
 			client.Jar = cookieJar
 		}
-		clientPool[i] = &poolClient{
+		clients[i] = &poolClient{
 			client: client,
 			busy:   false,
+			agent:  agent,
 		}
 	}
+	return &clientPool{
+		agent:       agent,
+		concurrency: concurrency,
+		clients:     clients,
+		useCookies:  useCookies,
+	}
+}
 
-	start := func(startURL *url.URL, configPaths []string) {
+func (w *Walker) scrapeloop() {
+	summaryVec, counterVec, totalCounter, progressGaugeOpen, progressGaugeComplete, counterVecStatus := setupMetrics()
+	running := 0
+	concurrency := 0
+	groupHeader := ""
+	ignoreRobots := false
+	started := false
+	ll := linkLimitations{}
+	var jobs map[string]bool
+	var results map[string]ScrapeResult
+	var baseURL *url.URL
+	paths := []string{}
+	var cp *clientPool
+	var robotsGroup *robotstxt.Group
+
+	restart := func(startURL *url.URL, configPaths []string) {
+		started = false
 		summaryVec.Reset()
 		counterVec.Reset()
 		counterVecStatus.Reset()
@@ -129,36 +94,36 @@ func (w *Walker) scrapeloop() {
 			jobs = map[string]bool{baseURLString + p + q: false}
 		}
 		results = map[string]ScrapeResult{}
+		started = true
 	}
 
-	for {
-		progressGaugeComplete.Set(float64(len(results)))
-		progressGaugeOpen.Set(float64(len(jobs)))
-		if len(jobs) > 0 {
-		JobLoop:
-			for jobURL, jobActive := range jobs {
-				if running >= w.concurrency {
-					// concurrency limit
-					break
-				}
-				if !jobActive {
-					for _, poolClient := range clientPool {
-						if !poolClient.busy {
-							running++
-							jobs[jobURL] = true
-							poolClient.busy = true
-							// u, _ := url.Parse(jobURL)
-							// fmt.Println("got pool client", i, poolClient.client.Jar.Cookies(u))
-							go Scrape(poolClient, jobURL, groupHeader, w.chanResult)
-							continue JobLoop
-						}
-					}
-					// fmt.Println("all clients are busy")
-					break JobLoop
-				}
-			}
-		}
+	for {
+		if started {
+			progressGaugeComplete.Set(float64(len(results)))
+			progressGaugeOpen.Set(float64(len(jobs)))
+			if len(jobs) > 0 {
+			JobLoop:
+				for jobURL, jobActive := range jobs {
+					if running >= concurrency {
+						// concurrency limit
+						break
+					}
+					if !jobActive {
+						for _, poolClient := range cp.clients {
+							if !poolClient.busy {
+								running++
+								jobs[jobURL] = true
+								poolClient.busy = true
+								go Scrape(poolClient, jobURL, groupHeader, w.chanResult)
+								continue JobLoop
+							}
+						}
+						break JobLoop
+					}
+				}
+			}
+		}
 		// time to restart
 		if results != nil && len(jobs) == 0 && running == 0 && baseURL != nil {
 			fmt.Println("restarting", baseURL, paths)
@@ -166,27 +131,57 @@ func (w *Walker) scrapeloop() {
 				Results: results,
 				Jobs:    jobs,
 			}
-			start(baseURL, paths)
+			restart(baseURL, paths)
 		}
 		select {
 		case <-time.After(time.Millisecond * 1000):
 			// make sure we do not get stuck
 		case st := <-w.chanStart:
+			robotsGroup = nil
 			groupHeader = st.conf.GroupHeader
-			ignore = st.conf.Ignore
-			depth = st.conf.Depth
-			paging = st.conf.Paging
+			concurrency = st.conf.Concurrency
+			ll.ignorePathPrefixes = st.conf.Ignore
+			ll.depth = st.conf.Depth
+			ll.paging = st.conf.Paging
+			ll.includePathPrefixes = st.conf.Target.Paths
 			ignoreRobots = st.conf.IgnoreRobots
-			ignoreQueriesWith = st.conf.IgnoreQueriesWith
-			ignoreAllQueries = st.conf.IgnoreAllQueries
+			ll.ignoreQueriesWith = st.conf.IgnoreQueriesWith
+			ll.ignoreAllQueries = st.conf.IgnoreAllQueries
+			if cp == nil || cp.agent != st.conf.Agent || cp.concurrency != st.conf.Concurrency || cp.useCookies != st.conf.UseCookies {
+				cp = newClientPool(st.conf.Concurrency, st.conf.Agent, st.conf.UseCookies)
+			}
+			var errStart error
 			startU, errParseStartU := url.Parse(st.conf.Target.BaseURL)
-			if errParseStartU == nil {
-				start(startU, st.conf.Target.Paths)
+			if errParseStartU != nil {
+				errStart = errParseStartU
+			}
+			if errStart == nil && !ignoreRobots {
+				robotsData, errRobotsGroup := getRobotsData(st.conf.Target.BaseURL)
+				if errRobotsGroup == nil {
+					robotsGroup = robotsData.FindGroup(st.conf.Agent)
+					robotForbiddenPath := []string{}
+					for _, p := range st.conf.Target.Paths {
+						if !robotsGroup.Test(p) {
+							robotForbiddenPath = append(robotForbiddenPath, p)
+						}
+					}
+					if len(robotForbiddenPath) > 0 {
+						errStart = errors.New("robots.txt does not allow access to the following path (you can either ignore robots or try as a different user agent): " + strings.Join(robotForbiddenPath, ", "))
+					}
+				} else {
+					errStart = errRobotsGroup
+				}
+			}
+			if errStart == nil {
+				restart(startU, st.conf.Target.Paths)
 				w.chanErrStart <- nil
 			} else {
-				w.chanErrStart <- errParseStartU
+				w.chanErrStart <- errStart
 			}
 		case <-w.chanStatus:
 			resultsCopy := make(map[string]ScrapeResult, len(results))
 			jobsCopy := make(map[string]bool, len(jobs))
@@ -246,110 +241,31 @@ func (w *Walker) scrapeloop() {
 				totalCounter.Inc()
 				if ignoreRobots || !strings.Contains(scanResult.Structure.Robots, "nofollow") {
+					linkNextNormalized := ""
+					linkPrevNormalized := ""
 					// should we follow the links
-					for linkURL := range scanResult.Links {
-						// is it a pager link
-						// this might want to be normalized
-						isPagerLink := scanResult.Structure.LinkNext == linkURL || scanResult.Structure.LinkPrev == linkURL
-						if !paging && isPagerLink {
-							continue
-						}
-						// ok, time to really look at that url
-						linkU, errParseLinkU := normalizeLink(baseURL, linkURL)
-						if errParseLinkU == nil {
-							// to be ignored ?!
-							ignoreLink := false
-							if len(linkU.Query()) > 0 {
-								// it has a query
-								if ignoreAllQueries {
-									// no queries in general
-									ignoreLink = true
-								} else {
-									// do we filter a query parameter
-								IgnoreLoop:
-									for _, ignoreP := range ignoreQueriesWith {
-										for pName := range linkU.Query() {
-											if pName == ignoreP {
-												ignoreLink = true
-												break IgnoreLoop
-											}
-										}
-									}
-								}
-							}
-							if !ignoreLink {
-								foundPath := false
-								for _, p := range paths {
-									if strings.HasPrefix(linkU.Path, p) {
-										foundPath = true
-										break
-									}
-								}
-								if !foundPath {
-									// not in the scrape path
-									ignoreLink = true
-								}
-							}
-							if !ignoreLink && depth > 0 {
-								// too deep?
-								linkDepth := len(strings.Split(linkU.Path, "/")) - 1
-								ignoreLink = linkDepth > depth
-								if ignoreLink {
-									fmt.Println("ignoring", linkU.Path, depth, linkDepth)
-								}
-							}
-							// ignore prefix
-							if !ignoreLink {
-								for _, ignorePrefix := range ignore {
-									if strings.HasPrefix(linkU.Path, ignorePrefix) {
-										ignoreLink = true
-										break
-									}
-								}
-							}
-							if !ignoreLink && linkU.Host == baseURL.Host &&
-								linkU.Scheme == baseURL.Scheme {
-								scanResult.Links[linkU.String()] = scanResult.Links[linkURL]
-								linkURL = linkU.String()
-								_, existingResultOK := results[linkURL]
-								_, existingJobOK := jobs[linkURL]
-								if !existingResultOK && !existingJobOK {
-									jobs[linkURL] = false
-								}
-							}
-						}
+					linkNextNormalizedURL, errNormalizeNext := normalizeLink(baseURL, scanResult.Structure.LinkNext)
+					if errNormalizeNext == nil {
+						linkNextNormalized = linkNextNormalizedURL.String()
+					}
+					linkPrevNormalizedURL, errNormalizedPrev := normalizeLink(baseURL, scanResult.Structure.LinkPrev)
+					if errNormalizedPrev == nil {
+						linkPrevNormalized = linkPrevNormalizedURL.String()
+					}
+					linksToScrape := filterScrapeLinks(scanResult.Links, baseURL, linkNextNormalized, linkPrevNormalized, ll, robotsGroup)
+					for linkToScrape := range linksToScrape {
+						// scanResult.Links[linkU.String()] = scanResult.Links[linkURL]
+						// linkURL = linkU.String()
+						_, existingResultOK := results[linkToScrape]
+						_, existingJobOK := jobs[linkToScrape]
+						if !existingResultOK && !existingJobOK {
+							jobs[linkToScrape] = false
+						}
 					}
 				}
 			}
 		}
 	}
 }
-
-func normalizeLink(baseURL *url.URL, linkURL string) (normalizedLink *url.URL, err error) {
-	// let us ditch anchors
-	anchorParts := strings.Split(linkURL, "#")
-	linkURL = anchorParts[0]
-	link, errParseLink := url.Parse(linkURL)
-	if errParseLink != nil {
-		err = errParseLink
-		return
-	}
-	// host
-	if link.Host == "" {
-		link.Host = baseURL.Host
-	}
-	// scheme
-	if link.Scheme == "" || link.Scheme == "//" {
-		link.Scheme = baseURL.Scheme
-	}
-	if baseURL.User != nil {
-		link.User = baseURL.User
-	}
-	// it is beautiful now
-	normalizedLink = link
-	return
-}
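
The new filterScrapeLinks body itself is not part of this diff (only its signature appears in the first file above), but from the call site and the inline logic removed here it presumably combines the old pager, query, path-prefix and depth checks with the new robots.txt group test. A rough sketch under that assumption; LinkList is stubbed as a URL-keyed map and the body is illustrative, not the committed implementation:

package sketch

import (
	"net/url"
	"strings"

	"github.com/temoto/robotstxt"
)

// Stand-in for the walker package's LinkList; here it only matters that it
// is keyed by the raw link URL.
type LinkList map[string]int

type linkLimitations struct {
	depth               int
	paging              bool
	ignoreAllQueries    bool
	ignorePathPrefixes  []string
	includePathPrefixes []string
	ignoreQueriesWith   []string
}

// filterScrapeLinks (illustrative body): keep only same-host, same-scheme
// links that survive the pager, query, path-prefix, depth and robots.txt
// checks that used to be inlined in scrapeloop.
func filterScrapeLinks(
	linkList LinkList,
	baseURL *url.URL,
	linkNextNormalized string,
	linkPrevNormalized string,
	ll linkLimitations,
	robotsGroup *robotstxt.Group,
) map[string]bool {
	keep := map[string]bool{}
LinkLoop:
	for linkURL := range linkList {
		// ditch anchors and resolve relative links against the base URL
		linkU, errParse := url.Parse(strings.Split(linkURL, "#")[0])
		if errParse != nil {
			continue
		}
		linkU = baseURL.ResolveReference(linkU)
		if linkU.Host != baseURL.Host || linkU.Scheme != baseURL.Scheme {
			continue
		}
		normalized := linkU.String()
		// pager links are only followed when paging is enabled
		if !ll.paging && (normalized == linkNextNormalized || normalized == linkPrevNormalized) {
			continue
		}
		// query handling
		if len(linkU.Query()) > 0 {
			if ll.ignoreAllQueries {
				continue
			}
			for _, p := range ll.ignoreQueriesWith {
				if _, ok := linkU.Query()[p]; ok {
					continue LinkLoop
				}
			}
		}
		// must live under one of the configured target paths ...
		included := false
		for _, p := range ll.includePathPrefixes {
			if strings.HasPrefix(linkU.Path, p) {
				included = true
				break
			}
		}
		if !included {
			continue
		}
		// ... and not under an ignored prefix
		for _, p := range ll.ignorePathPrefixes {
			if strings.HasPrefix(linkU.Path, p) {
				continue LinkLoop
			}
		}
		// depth limit, counted in path segments as before
		if ll.depth > 0 && len(strings.Split(linkU.Path, "/"))-1 > ll.depth {
			continue
		}
		// finally, the new robots.txt check (robotsGroup stays nil when robots are ignored)
		if robotsGroup != nil && !robotsGroup.Test(linkU.Path) {
			continue
		}
		keep[normalized] = true
	}
	return keep
}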

(changed file, name not shown)

@@ -15,8 +15,11 @@ type Service struct {
 }
 
 func NewService(conf *config.Config) (s *Service, err error) {
-	w := NewWalker(conf.Concurrency, conf.UseCookies)
-	w.walk(conf)
+	w := NewWalker()
+	errWalk := w.walk(conf)
+	if errWalk != nil {
+		return nil, errWalk
+	}
 	s = &Service{
 		Walker: w,
 		// targetURL: conf.Target,