mirror of https://github.com/foomo/walker.git
initial robots.txt support
commit 91dc1fca07
parent df5259b447
@@ -1,5 +0,0 @@
-target: http://www.bestbytes.de
-concurrency: 2
-addr: ":3001"
-ignore:
-- /foomo
@@ -7,6 +7,15 @@ import (
 	"github.com/temoto/robotstxt"
 )
 
+type linkLimitations struct {
+	depth               int
+	paging              bool
+	ignoreAllQueries    bool
+	ignorePathPrefixes  []string
+	includePathPrefixes []string
+	ignoreQueriesWith   []string
+}
+
 func normalizeLink(baseURL *url.URL, linkURL string) (normalizedLink *url.URL, err error) {
 	// let us ditch anchors
 	anchorParts := strings.Split(linkURL, "#")
@@ -32,7 +41,7 @@ func normalizeLink(baseURL *url.URL, linkURL string) (normalizedLink *url.URL, e
 	return
 }
 
-func extractLinksToScrape(
+func filterScrapeLinks(
 	linkList LinkList,
 	baseURL *url.URL,
 	linkNextNormalized string,
go.mod
@@ -7,6 +7,7 @@ require (
 	github.com/davecgh/go-spew v1.1.1
 	github.com/prometheus/client_golang v1.2.1
 	github.com/stretchr/testify v1.3.0
+	github.com/temoto/robotstxt v1.1.1
 	gopkg.in/yaml.v1 v1.0.0-20140924161607-9f9df34309c0
 	gopkg.in/yaml.v2 v2.2.2
 )
go.sum
@@ -66,6 +66,8 @@ github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+
 github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
 github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
 github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
+github.com/temoto/robotstxt v1.1.1 h1:Gh8RCs8ouX3hRSxxK7B1mO5RFByQ4CmJZDwgom++JaA=
+github.com/temoto/robotstxt v1.1.1/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo=
 golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
 golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
 golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
robotstxt.go (new file)
@@ -0,0 +1,19 @@
+package walker
+
+import (
+	"net/http"
+
+	"github.com/temoto/robotstxt"
+)
+
+func getRobotsData(baseURL string) (data *robotstxt.RobotsData, err error) {
+	resp, errGet := http.Get(baseURL + "/robots.txt")
+	if errGet != nil {
+		return nil, errGet
+	}
+	data, errFromResponse := robotstxt.FromResponse(resp)
+	if errFromResponse != nil {
+		return nil, errFromResponse
+	}
+	return data, nil
+}
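Note: the new helper wraps the temoto/robotstxt parser. A minimal usage sketch, not part of the commit; the base URL and user agent below are only illustrative (borrowed from the deleted example config and the old hard-coded agent in Scrape):

package main

import (
	"fmt"
	"net/http"

	"github.com/temoto/robotstxt"
)

// same shape as the helper added in robotstxt.go above
func getRobotsData(baseURL string) (*robotstxt.RobotsData, error) {
	resp, errGet := http.Get(baseURL + "/robots.txt")
	if errGet != nil {
		return nil, errGet
	}
	defer resp.Body.Close()
	return robotstxt.FromResponse(resp)
}

func main() {
	data, err := getRobotsData("http://www.bestbytes.de")
	if err != nil {
		fmt.Println("could not load robots.txt:", err)
		return
	}
	// ask the rule group for this user agent whether a path may be fetched
	group := data.FindGroup("foomo-walker")
	fmt.Println("allowed to crawl /foomo:", group.Test("/foomo"))
}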
@@ -50,7 +50,7 @@ func Scrape(pc *poolClient, targetURL string, groupHeader string, chanResult cha
 		chanResult <- result
 		return
 	}
-	req.Header.Set("User-Agent", "foomo-walker")
+	req.Header.Set("User-Agent", pc.agent)
 
 	resp, errGet := pc.client.Do(req)
 	if errGet != nil {
scrapeloop.go
@@ -1,6 +1,7 @@
 package walker
 
 import (
+	"errors"
 	"fmt"
 	"net"
 	"net/http"
@@ -10,85 +11,25 @@ import (
 	"strings"
 	"time"
 
-	"github.com/prometheus/client_golang/prometheus"
+	"github.com/temoto/robotstxt"
 )
 
 type poolClient struct {
+	agent  string
 	client *http.Client
 	busy   bool
 }
 
-func (w *Walker) scrapeloop() {
-	running := 0
-	depth := 0
-	paging := false
-	groupHeader := ""
-	ignoreAllQueries := false
-	ignoreRobots := false
-	var jobs map[string]bool
-	var results map[string]ScrapeResult
-	var ignore []string
-	var ignoreQueriesWith []string
-	var baseURL *url.URL
-	paths := []string{}
-	clientPool := []*poolClient{}
-	getBucketList()
-
-	const prometheusLabelGroup = "group"
-	const prometheusLabelStatus = "status"
-
-	summaryVec := prometheus.NewSummaryVec(
-		prometheus.SummaryOpts{
-			Name:       "walker_scrape_durations_seconds",
-			Help:       "scrape duration whole request time including streaming of body",
-			Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
-		},
-		[]string{prometheusLabelGroup},
-	)
-
-	counterVec := prometheus.NewCounterVec(
-		prometheus.CounterOpts{
-			Name: "walker_scrape_running_total",
-			Help: "Number of scrapes in scan.",
-		},
-		[]string{prometheusLabelGroup, prometheusLabelStatus},
-	)
-
-	totalCounter := prometheus.NewCounter(prometheus.CounterOpts{
-		Name: "walker_scrape_counter_total",
-		Help: "number of scrapes since start of walker",
-	})
-
-	progressGaugeOpen := prometheus.NewGauge(
-		prometheus.GaugeOpts{
-			Name: "walker_progress_gauge_open",
-			Help: "progress open to scrape",
-		},
-	)
-
-	progressGaugeComplete := prometheus.NewGauge(
-		prometheus.GaugeOpts{
-			Name: "walker_progress_gauge_complete",
-			Help: "progress complete scrapes",
-		},
-	)
-
-	counterVecStatus := prometheus.NewCounterVec(prometheus.CounterOpts{
-		Name: "walker_progress_status_code_total",
-		Help: "status codes for running scrape",
-	}, []string{prometheusLabelStatus})
-
-	prometheus.MustRegister(
-		summaryVec,
-		counterVec,
-		totalCounter,
-		progressGaugeComplete,
-		progressGaugeOpen,
-		counterVecStatus,
-	)
-
-	clientPool = make([]*poolClient, w.concurrency)
-	for i := 0; i < w.concurrency; i++ {
+type clientPool struct {
+	agent       string
+	concurrency int
+	useCookies  bool
+	clients     []*poolClient
+}
+
+func newClientPool(concurrency int, agent string, useCookies bool) *clientPool {
+	clients := make([]*poolClient, concurrency)
+	for i := 0; i < concurrency; i++ {
 		client := &http.Client{
 			Timeout: time.Second * 10,
 			Transport: &http.Transport{
@@ -98,17 +39,41 @@ func (w *Walker) scrapeloop() {
 				TLSHandshakeTimeout: 5 * time.Second,
 			},
 		}
-		if w.useCookies {
+		if useCookies {
 			cookieJar, _ := cookiejar.New(nil)
 			client.Jar = cookieJar
 		}
-		clientPool[i] = &poolClient{
+		clients[i] = &poolClient{
 			client: client,
 			busy:   false,
+			agent:  agent,
 		}
 	}
+	return &clientPool{
+		agent:       agent,
+		concurrency: concurrency,
+		clients:     clients,
+		useCookies:  useCookies,
+	}
+}
 
-	start := func(startURL *url.URL, configPaths []string) {
+func (w *Walker) scrapeloop() {
+	summaryVec, counterVec, totalCounter, progressGaugeOpen, progressGaugeComplete, counterVecStatus := setupMetrics()
+	running := 0
+	concurrency := 0
+	groupHeader := ""
+	ignoreRobots := false
+	started := false
+	ll := linkLimitations{}
+	var jobs map[string]bool
+	var results map[string]ScrapeResult
+	var baseURL *url.URL
+	paths := []string{}
+	var cp *clientPool
+	var robotsGroup *robotstxt.Group
+
+	restart := func(startURL *url.URL, configPaths []string) {
+		started = false
 		summaryVec.Reset()
 		counterVec.Reset()
 		counterVecStatus.Reset()
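Note: the Prometheus setup that used to be inlined here is now expected to come from a setupMetrics helper that is not part of this diff, and the HTTP clients move into the new clientPool type, each client carrying the configured user agent that Scrape now sends. A small sketch of how the pool is meant to stay in sync with the configuration, mirroring the check added further down; ensurePool is a hypothetical name, not code from this commit, and it assumes package walker:

// hypothetical helper, modelled on the cp == nil || ... check in scrapeloop()
func ensurePool(cp *clientPool, agent string, concurrency int, useCookies bool) *clientPool {
	// rebuild the pool whenever the agent, the concurrency or cookie handling change
	if cp == nil || cp.agent != agent || cp.concurrency != concurrency || cp.useCookies != useCookies {
		return newClientPool(concurrency, agent, useCookies)
	}
	return cp
}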
@@ -129,36 +94,36 @@ func (w *Walker) scrapeloop() {
 			jobs = map[string]bool{baseURLString + p + q: false}
 		}
 		results = map[string]ScrapeResult{}
+		started = true
 	}
-	for {
-		progressGaugeComplete.Set(float64(len(results)))
-		progressGaugeOpen.Set(float64(len(jobs)))
-		if len(jobs) > 0 {
-		JobLoop:
-			for jobURL, jobActive := range jobs {
-				if running >= w.concurrency {
-					// concurrency limit
-					break
-				}
-				if !jobActive {
-					for _, poolClient := range clientPool {
-						if !poolClient.busy {
-							running++
-							jobs[jobURL] = true
-							poolClient.busy = true
-							// u, _ := url.Parse(jobURL)
-							// fmt.Println("got pool client", i, poolClient.client.Jar.Cookies(u))
+
+	for {
+		if started {
+			progressGaugeComplete.Set(float64(len(results)))
+			progressGaugeOpen.Set(float64(len(jobs)))
+			if len(jobs) > 0 {
+			JobLoop:
+				for jobURL, jobActive := range jobs {
+					if running >= concurrency {
+						// concurrency limit
+						break
+					}
+					if !jobActive {
+						for _, poolClient := range cp.clients {
+							if !poolClient.busy {
+								running++
+								jobs[jobURL] = true
+								poolClient.busy = true
 								go Scrape(poolClient, jobURL, groupHeader, w.chanResult)
 								continue JobLoop
 							}
 						}
+						break JobLoop
 					}
-					// fmt.Println("all clients are busy")
-					break JobLoop
 				}
 			}
 		}
 
 		// time to restart
 		if results != nil && len(jobs) == 0 && running == 0 && baseURL != nil {
 			fmt.Println("restarting", baseURL, paths)
@@ -166,27 +131,57 @@ func (w *Walker) scrapeloop() {
 				Results: results,
 				Jobs:    jobs,
 			}
-			start(baseURL, paths)
+			restart(baseURL, paths)
 		}
 
 		select {
 		case <-time.After(time.Millisecond * 1000):
 			// make sure we do not get stuck
 		case st := <-w.chanStart:
+			robotsGroup = nil
 			groupHeader = st.conf.GroupHeader
-			ignore = st.conf.Ignore
-			depth = st.conf.Depth
-			paging = st.conf.Paging
+			concurrency = st.conf.Concurrency
+			ll.ignorePathPrefixes = st.conf.Ignore
+			ll.depth = st.conf.Depth
+			ll.paging = st.conf.Paging
+			ll.includePathPrefixes = st.conf.Target.Paths
 			ignoreRobots = st.conf.IgnoreRobots
-			ignoreQueriesWith = st.conf.IgnoreQueriesWith
-			ignoreAllQueries = st.conf.IgnoreAllQueries
+			ll.ignoreQueriesWith = st.conf.IgnoreQueriesWith
+			ll.ignoreAllQueries = st.conf.IgnoreAllQueries
 
+			if cp == nil || cp.agent != st.conf.Agent || cp.concurrency != st.conf.Concurrency || cp.useCookies != st.conf.UseCookies {
+				cp = newClientPool(st.conf.Concurrency, st.conf.Agent, st.conf.UseCookies)
+			}
+
+			var errStart error
 			startU, errParseStartU := url.Parse(st.conf.Target.BaseURL)
-			if errParseStartU == nil {
-				start(startU, st.conf.Target.Paths)
+			if errParseStartU != nil {
+				errStart = errParseStartU
+			}
+			if errStart == nil && !ignoreRobots {
+				robotsData, errRobotsGroup := getRobotsData(st.conf.Target.BaseURL)
+				if errRobotsGroup == nil {
+					robotsGroup = robotsData.FindGroup(st.conf.Agent)
+					robotForbiddenPath := []string{}
+					for _, p := range st.conf.Target.Paths {
+						if !robotsGroup.Test(p) {
+							robotForbiddenPath = append(robotForbiddenPath, p)
+						}
+					}
+					if len(robotForbiddenPath) > 0 {
+						errStart = errors.New("robots.txt does not allow access to the following path (you can either ignore robots or try as a different user agent): " + strings.Join(robotForbiddenPath, ", "))
+					}
+				} else {
+					errStart = errRobotsGroup
+				}
+			}
+			if errStart == nil {
+				restart(startU, st.conf.Target.Paths)
 				w.chanErrStart <- nil
 			} else {
-				w.chanErrStart <- errParseStartU
+				w.chanErrStart <- errStart
 			}
 
 		case <-w.chanStatus:
 			resultsCopy := make(map[string]ScrapeResult, len(results))
 			jobsCopy := make(map[string]bool, len(jobs))
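Note: with this hunk a start request fails fast when robots.txt disallows one of the configured target paths for the configured agent; crawling only begins once errStart stays nil. A runnable sketch of the underlying robotstxt calls; the robots.txt body and the paths below are illustrative, not taken from the repository:

package main

import (
	"fmt"

	"github.com/temoto/robotstxt"
)

func main() {
	// illustrative robots.txt body
	data, err := robotstxt.FromString("User-agent: *\nDisallow: /foomo\n")
	if err != nil {
		fmt.Println("invalid robots.txt:", err)
		return
	}
	group := data.FindGroup("foomo-walker")
	// this is what the new start-up check does for every configured target path
	for _, p := range []string{"/", "/foomo"} {
		fmt.Println(p, "allowed:", group.Test(p))
	}
	// a false result would abort the start with the robots.txt error above,
	// unless ignoreRobots is set or a different user agent is configured
}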
@@ -246,110 +241,31 @@ func (w *Walker) scrapeloop() {
 			totalCounter.Inc()
 
 			if ignoreRobots || !strings.Contains(scanResult.Structure.Robots, "nofollow") {
+				linkNextNormalized := ""
+				linkPrevNormalized := ""
 				// should we follow the links
-				for linkURL := range scanResult.Links {
-					// is it a pager link
-					// this might want to be normalized
-					isPagerLink := scanResult.Structure.LinkNext == linkURL || scanResult.Structure.LinkPrev == linkURL
-					if !paging && isPagerLink {
-						continue
-					}
-					// ok, time to really look at that url
-					linkU, errParseLinkU := normalizeLink(baseURL, linkURL)
-					if errParseLinkU == nil {
-
-						// to be ignored ?!
-						ignoreLink := false
-
-						if len(linkU.Query()) > 0 {
-							// it has a query
-							if ignoreAllQueries {
-								// no queries in general
-								ignoreLink = true
-							} else {
-								// do we filter a query parameter
-							IgnoreLoop:
-								for _, ignoreP := range ignoreQueriesWith {
-									for pName := range linkU.Query() {
-										if pName == ignoreP {
-											ignoreLink = true
-											break IgnoreLoop
-										}
-									}
-								}
-							}
-						}
-						if !ignoreLink {
-							foundPath := false
-							for _, p := range paths {
-								if strings.HasPrefix(linkU.Path, p) {
-									foundPath = true
-									break
-								}
-							}
-							if !foundPath {
-								// not in the scrape path
-								ignoreLink = true
-							}
-						}
-						if !ignoreLink && depth > 0 {
-							// too deep?
-							linkDepth := len(strings.Split(linkU.Path, "/")) - 1
-							ignoreLink = linkDepth > depth
-							if ignoreLink {
-								fmt.Println("ignoring", linkU.Path, depth, linkDepth)
-							}
-						}
-						// ignore prefix
-						if !ignoreLink {
-							for _, ignorePrefix := range ignore {
-								if strings.HasPrefix(linkU.Path, ignorePrefix) {
-									ignoreLink = true
-									break
-								}
-							}
-						}
-
-						if !ignoreLink && linkU.Host == baseURL.Host &&
-							linkU.Scheme == baseURL.Scheme {
-							scanResult.Links[linkU.String()] = scanResult.Links[linkURL]
-							linkURL = linkU.String()
-							_, existingResultOK := results[linkURL]
-							_, existingJobOK := jobs[linkURL]
-							if !existingResultOK && !existingJobOK {
-								jobs[linkURL] = false
-							}
-						}
+				linkNextNormalizedURL, errNormalizeNext := normalizeLink(baseURL, scanResult.Structure.LinkNext)
+				if errNormalizeNext == nil {
+					linkNextNormalized = linkNextNormalizedURL.String()
+				}
+				linkPrevNormalizedURL, errNormalizedPrev := normalizeLink(baseURL, scanResult.Structure.LinkPrev)
+				if errNormalizedPrev == nil {
+					linkPrevNormalized = linkPrevNormalizedURL.String()
+				}
+
+				linksToScrape := filterScrapeLinks(scanResult.Links, baseURL, linkNextNormalized, linkPrevNormalized, ll, robotsGroup)
+				for linkToScrape := range linksToScrape {
+					// scanResult.Links[linkU.String()] = scanResult.Links[linkURL]
+					// linkURL = linkU.String()
+					_, existingResultOK := results[linkToScrape]
+					_, existingJobOK := jobs[linkToScrape]
+					if !existingResultOK && !existingJobOK {
+						jobs[linkToScrape] = false
 					}
 				}
 
 			}
 
 		}
 	}
 }
-
-func normalizeLink(baseURL *url.URL, linkURL string) (normalizedLink *url.URL, err error) {
-	// let us ditch anchors
-	anchorParts := strings.Split(linkURL, "#")
-	linkURL = anchorParts[0]
-	link, errParseLink := url.Parse(linkURL)
-	if errParseLink != nil {
-		err = errParseLink
-		return
-	}
-	// host
-	if link.Host == "" {
-		link.Host = baseURL.Host
-	}
-	// scheme
-	if link.Scheme == "" || link.Scheme == "//" {
-		link.Scheme = baseURL.Scheme
-	}
-	if baseURL.User != nil {
-		link.User = baseURL.User
-	}
-	// it is beautiful now
-	normalizedLink = link
-	return
-}
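Note: the long inline filtering block above is replaced by a single call to filterScrapeLinks (the renamed extractLinksToScrape); its body is not part of this diff. Pieced together from the deleted code and the new call site, it presumably applies the linkLimitations and, new in this commit, the robots group, roughly as sketched below. The name filterScrapeLinksSketch and the map[string]bool return type are assumptions, and the sketch only compiles inside package walker, where LinkList, linkLimitations and normalizeLink live:

// a sketch of the filtering that the deleted block used to do inline; the real
// filterScrapeLinks implementation is outside this diff
func filterScrapeLinksSketch(
	linkList LinkList,
	baseURL *url.URL,
	linkNextNormalized string,
	linkPrevNormalized string,
	ll linkLimitations,
	robotsGroup *robotstxt.Group,
) map[string]bool {
	linksToScrape := map[string]bool{}
LinkLoop:
	for linkURL := range linkList {
		linkU, errParse := normalizeLink(baseURL, linkURL)
		if errParse != nil {
			continue
		}
		normalized := linkU.String()
		// pager links are only followed when paging is enabled
		if !ll.paging && (normalized == linkNextNormalized || normalized == linkPrevNormalized) {
			continue
		}
		// drop links with queries, either all of them or those carrying ignored parameters
		if len(linkU.Query()) > 0 {
			if ll.ignoreAllQueries {
				continue
			}
			for _, ignoreP := range ll.ignoreQueriesWith {
				if _, ok := linkU.Query()[ignoreP]; ok {
					continue LinkLoop
				}
			}
		}
		// only follow links under the configured target paths
		inPath := false
		for _, p := range ll.includePathPrefixes {
			if strings.HasPrefix(linkU.Path, p) {
				inPath = true
				break
			}
		}
		if !inPath {
			continue
		}
		// optional depth limit
		if ll.depth > 0 && len(strings.Split(linkU.Path, "/"))-1 > ll.depth {
			continue
		}
		// explicitly ignored path prefixes
		for _, ignorePrefix := range ll.ignorePathPrefixes {
			if strings.HasPrefix(linkU.Path, ignorePrefix) {
				continue LinkLoop
			}
		}
		// stay on the same host and scheme
		if linkU.Host != baseURL.Host || linkU.Scheme != baseURL.Scheme {
			continue
		}
		// new in this commit: presumably skip links that robots.txt disallows
		if robotsGroup != nil && !robotsGroup.Test(linkU.Path) {
			continue
		}
		linksToScrape[normalized] = true
	}
	return linksToScrape
}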
@@ -15,8 +15,11 @@ type Service struct {
 }
 
 func NewService(conf *config.Config) (s *Service, err error) {
-	w := NewWalker(conf.Concurrency, conf.UseCookies)
-	w.walk(conf)
+	w := NewWalker()
+	errWalk := w.walk(conf)
+	if errWalk != nil {
+		return nil, errWalk
+	}
 	s = &Service{
 		Walker: w,
 		// targetURL: conf.Target,