added ScrapeResultModifierFunc

Jan Halfar 2020-06-23 17:05:00 +02:00
parent 4c5d090358
commit e2aeca122d
5 changed files with 49 additions and 20 deletions

View File

@@ -3,18 +3,24 @@ package walker
 import (
 	"encoding/json"
 	"fmt"
+	"strings"
+
 	"github.com/PuerkitoBio/goquery"
 	"github.com/foomo/walker/vo"
 )
 
+func extractTrimText(txt string) string {
+	return strings.Trim(txt, " \n")
+}
+
 // ExtractStructure extracts the most important semantic elements out of an HTML document
 func ExtractStructure(doc *goquery.Document) (s vo.Structure, err error) {
 	description, _ := doc.Find("meta[name=description]").First().Attr("content")
 	robots, _ := doc.Find("meta[name=robots]").First().Attr("content")
 	s = vo.Structure{
-		Title:       doc.Find("title").First().Text(),
-		Description: description,
-		Robots:      robots,
+		Title:       extractTrimText(doc.Find("title").First().Text()),
+		Description: extractTrimText(description),
+		Robots:      extractTrimText(robots),
 	}
 	doc.Find("link[rel=prev], link[rel=next], link[rel=canonical]").Each(func(i int, sel *goquery.Selection) {
 		attrRelVal, attrRelOK := sel.Attr("rel")
@@ -22,11 +28,11 @@ func ExtractStructure(doc *goquery.Document) (s vo.Structure, err error) {
 		if attrRelOK && attrHrefOK {
 			switch attrRelVal {
 			case "canonical":
-				s.Canonical = attrHref
+				s.Canonical = extractTrimText(attrHref)
 			case "prev":
-				s.LinkPrev = attrHref
+				s.LinkPrev = extractTrimText(attrHref)
 			case "next":
-				s.LinkNext = attrHref
+				s.LinkNext = extractTrimText(attrHref)
 			}
 		}
 	})
@@ -57,7 +63,7 @@ func ExtractStructure(doc *goquery.Document) (s vo.Structure, err error) {
 		}
 		s.Headings = append(s.Headings, vo.Heading{
 			Level: level,
-			Text:  sel.Text(),
+			Text:  extractTrimText(sel.Text()),
 		})
 	})
 	return
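
The net effect of these changes: every field that ExtractStructure emits (title, description, robots, the canonical/prev/next links, heading texts) is now whitespace-trimmed. A standalone sketch of the helper's behavior, with only the function body taken from the diff above:

package main

import (
	"fmt"
	"strings"
)

// extractTrimText, as added above: strips leading and trailing
// spaces and newlines (tabs are left alone).
func extractTrimText(txt string) string {
	return strings.Trim(txt, " \n")
}

func main() {
	// Pretty-printed HTML often wraps <title> text in whitespace.
	fmt.Printf("%q\n", extractTrimText("\n    My Page Title ")) // "My Page Title"
}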

View File

@@ -43,6 +43,7 @@ func reportSEO(status vo.Status, w io.Writer, filter scrapeResultFilter) {
 	descriptions := duplications{}
 	missingTitles := uniqueList{}
 	missingH1 := uniqueList{}
+	emptyH1 := uniqueList{}
 	missingDescriptions := uniqueList{}
 	printh("SEO duplications")
 	for _, r := range status.Results {
@@ -50,12 +51,14 @@ func reportSEO(status vo.Status, w io.Writer, filter scrapeResultFilter) {
 			continue
 		}
 		if r.Code != http.StatusOK {
+			// println("skipping with code", r.Code, r.TargetURL)
 			continue
 		}
 		finalURL := getFinalURLForScrapeResult(r)
 		normalizedCanonical := normalizeCanonical(r.TargetURL, r.Structure.Canonical)
 		if normalizedCanonical != finalURL {
 			// we are skipping this one
+			// println("skipping normalizedCanonical != finalURL", normalizedCanonical, "!=", finalURL)
 			continue
 		}
 		if strings.Contains(r.ContentType, "html") {
@@ -71,8 +74,12 @@ func reportSEO(status vo.Status, w io.Writer, filter scrapeResultFilter) {
 			} else {
 				titles.add(r.Structure.Title, finalURL)
 			}
-			if heading.Level == 1 && heading.Text != "" {
-				h1s.add(heading.Text, finalURL)
+			if heading.Level == 1 {
+				if heading.Text != "" {
+					h1s.add(heading.Text, finalURL)
+				} else {
+					emptyH1.add(finalURL)
+				}
 				foundH1 = true
 			}
 		}
@@ -105,4 +112,5 @@ func reportSEO(status vo.Status, w io.Writer, filter scrapeResultFilter) {
 	printList("missing titles", missingTitles)
 	printList("missing descriptions", missingDescriptions)
 	printList("missing h1", missingH1)
+	printList("empty h1", emptyH1)
 }
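
The report now separates three H1 outcomes per page: non-empty (tracked for duplicates), present but empty (the new emptyH1 list), and absent. A condensed sketch of the classification; the !foundH1 branch feeding missingH1 lies between the hunks shown and is assumed here, as is uniqueList being a set-like list with an add method:

foundH1 := false
for _, heading := range r.Structure.Headings {
	if heading.Level == 1 {
		if heading.Text != "" {
			h1s.add(heading.Text, finalURL) // non-empty h1, duplicate tracking
		} else {
			emptyH1.add(finalURL) // h1 present, text blank (new in this commit)
		}
		foundH1 = true
	}
}
if !foundH1 {
	missingH1.add(finalURL) // assumed: page has no h1 at all
}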

View File

@@ -105,6 +105,7 @@ func (w *Walker) scrapeloop() {
 	var scrapeFunc ScrapeFunc
 	var validationFunc ValidationFunc
 	var linkListFilterFunc LinkListFilterFunc
+	var scrapeResultModifierFunc ScrapeResultModifierFunc
 	ll := linkLimitations{}
 	var jobs map[string]bool
 	var results map[string]vo.ScrapeResult
@@ -252,6 +253,7 @@ func (w *Walker) scrapeloop() {
 		ignoreRobots = st.conf.IgnoreRobots
 		ll.ignoreQueriesWith = st.conf.IgnoreQueriesWith
 		ll.ignoreAllQueries = st.conf.IgnoreAllQueries
+		scrapeResultModifierFunc = st.scrapeResultModifierFunc
 		if cp == nil || cp.agent != st.conf.Agent || cp.concurrency != st.conf.Concurrency || cp.useCookies != st.conf.UseCookies {
 			cp = newClientPool(st.conf.Concurrency, st.conf.Agent, st.conf.UseCookies)
@@ -301,6 +303,14 @@ func (w *Walker) scrapeloop() {
 		case scanResult := <-w.chanResult:
 			running--
 			delete(jobs, scanResult.result.TargetURL)
+			if scrapeResultModifierFunc != nil {
+				modifiedScrapeResult, errModify := scrapeResultModifierFunc(scanResult.result)
+				if errModify == nil {
+					scanResult.result = modifiedScrapeResult
+				} else {
+					fmt.Println("could not modify scrape result", errModify)
+				}
+			}
 			scanResult.poolClient.busy = false
 			scanResult.result.Time = time.Now()
 			statusCodeAsString := strconv.Itoa(scanResult.result.Code)
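
This nil-checked hook is the heart of the commit: after a scrape returns and before the result is recorded, an optional caller-supplied function may rewrite the vo.ScrapeResult; if it errors, the original result is kept and the error is logged. A minimal sketch of such a modifier, assuming a net/url import; the query-stripping use case is illustrative, not from the source:

// stripQueries removes the query string from the recorded target URL.
func stripQueries(result vo.ScrapeResult) (vo.ScrapeResult, error) {
	u, err := url.Parse(result.TargetURL)
	if err != nil {
		// returning an error keeps the original result
		return result, err
	}
	u.RawQuery = ""
	result.TargetURL = u.String()
	return result, nil
}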

View File

@@ -20,9 +20,10 @@ func NewService(
 	linkListFilter LinkListFilterFunc,
 	scrapeFunc ScrapeFunc,
 	validationFunc ValidationFunc,
+	scrapeResultModifierFunc ScrapeResultModifierFunc,
 ) (s *Service, chanLoopComplete chan vo.Status, err error) {
 	w := NewWalker()
-	chanLoopComplete, errWalk := w.Walk(conf, linkListFilter, scrapeFunc, validationFunc)
+	chanLoopComplete, errWalk := w.Walk(conf, linkListFilter, scrapeFunc, validationFunc, scrapeResultModifierFunc)
 	if errWalk != nil {
 		return nil, nil, errWalk
 	}
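
This is a breaking signature change: every caller of NewService (and of Walker.Walk below) gains a trailing argument. A hedged sketch of an updated call site; conf and the other hook values are placeholders, and passing nil for the modifier is safe because the scrape loop only invokes it when non-nil:

service, chanLoopComplete, err := walker.NewService(
	conf,           // *config.Config, forwarded to Walk
	linkListFilter, // LinkListFilterFunc
	scrapeFunc,     // ScrapeFunc
	validationFunc, // ValidationFunc
	stripQueries,   // ScrapeResultModifierFunc; nil to opt out
)
if err != nil {
	log.Fatal(err)
}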

View File

@@ -15,11 +15,12 @@ import (
 )
 
 type start struct {
-	conf               config.Config
-	groupValidator     *htmlschema.GroupValidator
-	linkListFilterFunc LinkListFilterFunc
-	validationFunc     ValidationFunc
-	scrapeFunc         ScrapeFunc
+	conf                     config.Config
+	groupValidator           *htmlschema.GroupValidator
+	linkListFilterFunc       LinkListFilterFunc
+	validationFunc           ValidationFunc
+	scrapeFunc               ScrapeFunc
+	scrapeResultModifierFunc ScrapeResultModifierFunc
 }
 
 type started struct {
@@ -44,6 +45,7 @@ func sortPathsByLength(paths []string) []string {
 type LinkListFilterFunc func(baseURL, docURL *url.URL, doc *goquery.Document) (ll vo.LinkList, err error)
 type ScrapeFunc func(response *http.Response) (scrapeData interface{}, err error)
+type ScrapeResultModifierFunc func(result vo.ScrapeResult) (modifiedResult vo.ScrapeResult, err error)
 type ValidationFunc func(structure vo.Structure, scrapeData interface{}) (vo.Validations, error)
 
 type Walker struct {
@@ -72,6 +74,7 @@ func (w *Walker) Walk(
 	linkListFilter LinkListFilterFunc,
 	scrapeFunc ScrapeFunc,
 	validationFunc ValidationFunc,
+	scrapeResultModifierFunc ScrapeResultModifierFunc,
 ) (chanLoopStatus chan vo.Status, err error) {
 	var groupValidator *htmlschema.GroupValidator
 	if conf.SchemaRoot != "" {
@@ -82,11 +85,12 @@ func (w *Walker) Walk(
 		groupValidator = gv
 	}
 	w.chanStart <- start{
-		groupValidator:     groupValidator,
-		conf:               *conf,
-		scrapeFunc:         scrapeFunc,
-		linkListFilterFunc: linkListFilter,
-		validationFunc:     validationFunc,
+		groupValidator:           groupValidator,
+		conf:                     *conf,
+		scrapeFunc:               scrapeFunc,
+		linkListFilterFunc:       linkListFilter,
+		validationFunc:           validationFunc,
+		scrapeResultModifierFunc: scrapeResultModifierFunc,
 	}
 	st := <-w.chanStarted
 	return st.ChanLoopComplete, st.Err