mirror of https://github.com/foomo/walker.git, synced 2025-10-16 12:45:39 +00:00

added ScrapeResultModifierFunc

parent 4c5d090358
commit e2aeca122d
extract.go (20 changed lines)
@@ -3,18 +3,24 @@ package walker

 import (
 	"encoding/json"
 	"fmt"
 	"strings"

 	"github.com/PuerkitoBio/goquery"
 	"github.com/foomo/walker/vo"
 )

+func extractTrimText(txt string) string {
+	return strings.Trim(txt, " \n")
+}
+
 // ExtractStructure extracts the most important semantic elements out of an HTML document
 func ExtractStructure(doc *goquery.Document) (s vo.Structure, err error) {
 	description, _ := doc.Find("meta[name=description]").First().Attr("content")
 	robots, _ := doc.Find("meta[name=robots]").First().Attr("content")
 	s = vo.Structure{
-		Title:       doc.Find("title").First().Text(),
-		Description: description,
-		Robots:      robots,
+		Title:       extractTrimText(doc.Find("title").First().Text()),
+		Description: extractTrimText(description),
+		Robots:      extractTrimText(robots),
 	}
 	doc.Find("link[rel=prev], link[rel=next], link[rel=canonical]").Each(func(i int, sel *goquery.Selection) {
 		attrRelVal, attrRelOK := sel.Attr("rel")
@@ -22,11 +28,11 @@ func ExtractStructure(doc *goquery.Document) (s vo.Structure, err error) {
 		if attrRelOK && attrHrefOK {
 			switch attrRelVal {
 			case "canonical":
-				s.Canonical = attrHref
+				s.Canonical = extractTrimText(attrHref)
 			case "prev":
-				s.LinkPrev = attrHref
+				s.LinkPrev = extractTrimText(attrHref)
 			case "next":
-				s.LinkNext = attrHref
+				s.LinkNext = extractTrimText(attrHref)
 			}
 		}
 	})
@@ -57,7 +63,7 @@ func ExtractStructure(doc *goquery.Document) (s vo.Structure, err error) {
 		}
 		s.Headings = append(s.Headings, vo.Heading{
 			Level: level,
-			Text:  sel.Text(),
+			Text:  extractTrimText(sel.Text()),
 		})
 	})
 	return
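The new extractTrimText helper only strips leading and trailing spaces and newlines. A minimal, self-contained sketch of its effect on a typical <title> (the HTML snippet below is invented for illustration):

package main

import (
	"fmt"
	"strings"

	"github.com/PuerkitoBio/goquery"
)

// extractTrimText is copied from the diff above: it trims spaces and
// newlines at both ends, but not tabs or carriage returns.
func extractTrimText(txt string) string {
	return strings.Trim(txt, " \n")
}

func main() {
	html := "<html><head><title>\n  Hello World \n</title></head><body></body></html>"
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
	if err != nil {
		panic(err)
	}
	fmt.Printf("%q\n", extractTrimText(doc.Find("title").First().Text())) // "Hello World"
}

Note that strings.Trim with the cutset " \n" leaves tabs and carriage returns in place; strings.TrimSpace would cover those as well, if that ever matters here.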
@@ -43,6 +43,7 @@ func reportSEO(status vo.Status, w io.Writer, filter scrapeResultFilter) {
 	descriptions := duplications{}
 	missingTitles := uniqueList{}
 	missingH1 := uniqueList{}
+	emptyH1 := uniqueList{}
 	missingDescriptions := uniqueList{}
 	printh("SEO duplications")
 	for _, r := range status.Results {
@@ -50,12 +51,14 @@ func reportSEO(status vo.Status, w io.Writer, filter scrapeResultFilter) {
 			continue
 		}
 		if r.Code != http.StatusOK {
+			// println("skipping with code", r.Code, r.TargetURL)
 			continue
 		}
 		finalURL := getFinalURLForScrapeResult(r)
 		normalizedCanonical := normalizeCanonical(r.TargetURL, r.Structure.Canonical)
 		if normalizedCanonical != finalURL {
 			// we are skipping this one
+			// println("skipping normalizedCanonical != finalURL", normalizedCanonical, "!=", finalURL)
 			continue
 		}
 		if strings.Contains(r.ContentType, "html") {
@@ -71,8 +74,12 @@ func reportSEO(status vo.Status, w io.Writer, filter scrapeResultFilter) {
 			} else {
 				titles.add(r.Structure.Title, finalURL)
 			}
-			if heading.Level == 1 && heading.Text != "" {
-				h1s.add(heading.Text, finalURL)
+			if heading.Level == 1 {
+				if heading.Text != "" {
+					h1s.add(heading.Text, finalURL)
+				} else {
+					emptyH1.add(finalURL)
+				}
 				foundH1 = true
 			}
 		}
@@ -105,4 +112,5 @@ func reportSEO(status vo.Status, w io.Writer, filter scrapeResultFilter) {
 	printList("missing titles", missingTitles)
 	printList("missing descriptions", missingDescriptions)
 	printList("missing h1", missingH1)
+	printList("empty h1", emptyH1)
 }
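uniqueList and duplications are internal helpers whose definitions are not part of this diff. Purely for orientation, a hypothetical minimal shape consistent with the call sites above (one-argument add for uniqueList, key-plus-URL add for duplications); the real definitions live elsewhere in the repository and may differ:

package report

// uniqueList is a hypothetical sketch: it records each URL at most once,
// e.g. emptyH1.add(finalURL).
type uniqueList map[string]bool

func (u uniqueList) add(url string) { u[url] = true }

// duplications is a hypothetical sketch: it groups URLs under a shared
// value (title, description, h1 text), e.g. titles.add(title, finalURL),
// so a key with more than one URL is a duplication.
type duplications map[string][]string

func (d duplications) add(key, url string) { d[key] = append(d[key], url) }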
@@ -105,6 +105,7 @@ func (w *Walker) scrapeloop() {
 	var scrapeFunc ScrapeFunc
 	var validationFunc ValidationFunc
 	var linkListFilterFunc LinkListFilterFunc
+	var scrapeResultModifierFunc ScrapeResultModifierFunc
 	ll := linkLimitations{}
 	var jobs map[string]bool
 	var results map[string]vo.ScrapeResult
@@ -252,6 +253,7 @@ func (w *Walker) scrapeloop() {
 			ignoreRobots = st.conf.IgnoreRobots
 			ll.ignoreQueriesWith = st.conf.IgnoreQueriesWith
 			ll.ignoreAllQueries = st.conf.IgnoreAllQueries
+			scrapeResultModifierFunc = st.scrapeResultModifierFunc

 			if cp == nil || cp.agent != st.conf.Agent || cp.concurrency != st.conf.Concurrency || cp.useCookies != st.conf.UseCookies {
 				cp = newClientPool(st.conf.Concurrency, st.conf.Agent, st.conf.UseCookies)
@@ -301,6 +303,14 @@ func (w *Walker) scrapeloop() {
 		case scanResult := <-w.chanResult:
 			running--
 			delete(jobs, scanResult.result.TargetURL)
+			if scrapeResultModifierFunc != nil {
+				modifiedScrapeResult, errModify := scrapeResultModifierFunc(scanResult.result)
+				if errModify == nil {
+					scanResult.result = modifiedScrapeResult
+				} else {
+					fmt.Println("could not modify scrape result", errModify)
+				}
+			}
 			scanResult.poolClient.busy = false
 			scanResult.result.Time = time.Now()
 			statusCodeAsString := strconv.Itoa(scanResult.result.Code)
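The hook is only invoked when non-nil, and a returned error is logged while the unmodified result is kept. A hedged sketch of a conforming modifier (the trimming behavior is invented for illustration; only the signature, the Structure field, and the error semantics are taken from the diff):

package main

import (
	"fmt"
	"strings"

	"github.com/foomo/walker/vo"
)

// trimCanonicalModifier is a hypothetical ScrapeResultModifierFunc: it
// normalizes the canonical URL and never fails. If it did return an error,
// the scrape loop above would log it and keep the original result.
func trimCanonicalModifier(result vo.ScrapeResult) (vo.ScrapeResult, error) {
	result.Structure.Canonical = strings.TrimSuffix(result.Structure.Canonical, "/")
	return result, nil
}

func main() {
	// Assumes vo.ScrapeResult's zero value is usable and Structure is a
	// plain value field, as the r.Structure.Canonical usage above suggests.
	r := vo.ScrapeResult{}
	r.Structure.Canonical = "https://example.com/page/"
	modified, _ := trimCanonicalModifier(r)
	fmt.Println(modified.Structure.Canonical) // https://example.com/page
}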
@@ -20,9 +20,10 @@ func NewService(
 	linkListFilter LinkListFilterFunc,
 	scrapeFunc ScrapeFunc,
 	validationFunc ValidationFunc,
+	scrapeResultModifierFunc ScrapeResultModifierFunc,
 ) (s *Service, chanLoopComplete chan vo.Status, err error) {
 	w := NewWalker()
-	chanLoopComplete, errWalk := w.Walk(conf, linkListFilter, scrapeFunc, validationFunc)
+	chanLoopComplete, errWalk := w.Walk(conf, linkListFilter, scrapeFunc, validationFunc, scrapeResultModifierFunc)
 	if errWalk != nil {
 		return nil, nil, errWalk
 	}
walker.go (24 changed lines)
@@ -15,11 +15,12 @@ import (
 )

 type start struct {
-	conf               config.Config
-	groupValidator     *htmlschema.GroupValidator
-	linkListFilterFunc LinkListFilterFunc
-	validationFunc     ValidationFunc
-	scrapeFunc         ScrapeFunc
+	conf                     config.Config
+	groupValidator           *htmlschema.GroupValidator
+	linkListFilterFunc       LinkListFilterFunc
+	validationFunc           ValidationFunc
+	scrapeFunc               ScrapeFunc
+	scrapeResultModifierFunc ScrapeResultModifierFunc
 }

 type started struct {
@@ -44,6 +45,7 @@ func sortPathsByLength(paths []string) []string {

 type LinkListFilterFunc func(baseURL, docURL *url.URL, doc *goquery.Document) (ll vo.LinkList, err error)
 type ScrapeFunc func(response *http.Response) (scrapeData interface{}, err error)
+type ScrapeResultModifierFunc func(result vo.ScrapeResult) (modifiedResult vo.ScrapeResult, err error)
 type ValidationFunc func(structure vo.Structure, scrapeData interface{}) (vo.Validations, error)

 type Walker struct {
@@ -72,6 +74,7 @@ func (w *Walker) Walk(
 	linkListFilter LinkListFilterFunc,
 	scrapeFunc ScrapeFunc,
 	validationFunc ValidationFunc,
+	scrapeResultModifierFunc ScrapeResultModifierFunc,
 ) (chanLoopStatus chan vo.Status, err error) {
 	var groupValidator *htmlschema.GroupValidator
 	if conf.SchemaRoot != "" {
@@ -82,11 +85,12 @@ func (w *Walker) Walk(
 		groupValidator = gv
 	}
 	w.chanStart <- start{
-		groupValidator:     groupValidator,
-		conf:               *conf,
-		scrapeFunc:         scrapeFunc,
-		linkListFilterFunc: linkListFilter,
-		validationFunc:     validationFunc,
+		groupValidator:           groupValidator,
+		conf:                     *conf,
+		scrapeFunc:               scrapeFunc,
+		linkListFilterFunc:       linkListFilter,
+		validationFunc:           validationFunc,
+		scrapeResultModifierFunc: scrapeResultModifierFunc,
 	}
 	st := <-w.chanStarted
 	return st.ChanLoopComplete, st.Err
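End to end, the new hook becomes the fifth argument to Walk (and is forwarded by NewService). A hedged wiring sketch; import paths, config fields, and hook shapes are inferred from identifiers visible in this diff, and the no-op hooks are illustrative placeholders, not library defaults:

package main

import (
	"fmt"
	"log"
	"net/http"
	"net/url"

	"github.com/PuerkitoBio/goquery"
	"github.com/foomo/walker"
	"github.com/foomo/walker/config"
	"github.com/foomo/walker/vo"
)

// No-op placeholder hooks; real callers supply their own.
func linkFilter(baseURL, docURL *url.URL, doc *goquery.Document) (vo.LinkList, error) {
	return vo.LinkList{}, nil
}

func scrape(response *http.Response) (interface{}, error) { return nil, nil }

func validate(structure vo.Structure, scrapeData interface{}) (vo.Validations, error) {
	return vo.Validations{}, nil
}

// modifier passes results through unchanged; see the sketch above for a
// more interesting ScrapeResultModifierFunc. Passing nil would also be
// safe here, since the scrape loop checks for nil before calling it.
func modifier(result vo.ScrapeResult) (vo.ScrapeResult, error) { return result, nil }

func main() {
	// Field names Concurrency and Agent are taken from st.conf usage in the diff.
	conf := &config.Config{Concurrency: 4, Agent: "walker-example"}
	w := walker.NewWalker()
	chanStatus, err := w.Walk(conf, linkFilter, scrape, validate, modifier)
	if err != nil {
		log.Fatal(err)
	}
	for status := range chanStatus {
		fmt.Println("loop complete, results:", len(status.Results))
	}
}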