// Mirror of https://github.com/foomo/walker.git (synced 2025-10-16 12:45:39 +00:00).
// File size: 124 lines, 3.3 KiB. Language: Go.

package walker
|
|
|
|
import (
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"net/url"
|
|
"sort"
|
|
|
|
"github.com/PuerkitoBio/goquery"
|
|
"github.com/foomo/walker/config"
|
|
"github.com/foomo/walker/htmlschema"
|
|
"github.com/foomo/walker/reports"
|
|
"github.com/foomo/walker/vo"
|
|
)
|
|
|
|
type start struct {
|
|
conf config.Config
|
|
groupValidator *htmlschema.GroupValidator
|
|
linkListFilterFunc LinkListFilterFunc
|
|
validationFunc ValidationFunc
|
|
scrapeFunc ScrapeFunc
|
|
scrapeResultModifierFunc ScrapeResultModifierFunc
|
|
}
|
|
|
|
type started struct {
|
|
Err error
|
|
ChanLoopComplete chan vo.Status
|
|
}
|
|
|
|
// sortLenStrings implements sort.Interface for a slice of strings, ordering
// longer strings before shorter ones (length is measured in bytes).
type sortLenStrings []string

// Len returns the number of strings in the slice.
func (s sortLenStrings) Len() int {
	return len(s)
}

// Less reports whether element i is strictly longer than element j, i.e.
// whether it has to be placed before j.
func (s sortLenStrings) Less(i, j int) bool {
	return len(s[j]) < len(s[i])
}

// Swap exchanges the elements at positions i and j.
func (s sortLenStrings) Swap(i, j int) {
	s[j], s[i] = s[i], s[j]
}
|
|
|
|
func sortPathsByLength(paths []string) []string {
|
|
sls := make(sortLenStrings, len(paths))
|
|
copy(sls, paths)
|
|
sort.Sort(sls)
|
|
return []string(sls)
|
|
}
|
|
|
|
type LinkListFilterFunc func(baseURL, docURL *url.URL, doc *goquery.Document) (ll vo.LinkList, err error)
|
|
type ScrapeFunc func(response *http.Response) (scarepeData interface{}, err error)
|
|
type ScrapeResultModifierFunc func(result vo.ScrapeResult) (modifiedResult vo.ScrapeResult, err error)
|
|
type ValidationFunc func(structure vo.Structure, scrapeData interface{}) (vo.Validations, error)
|
|
|
|
type Walker struct {
|
|
chanResult chan scrapeResultAndClient
|
|
chanStart chan start
|
|
chanStatus chan vo.Status
|
|
chanStop chan vo.Status
|
|
chanStarted chan started
|
|
CompleteStatus *vo.Status
|
|
}
|
|
|
|
func NewWalker() *Walker {
|
|
w := &Walker{
|
|
chanResult: make(chan scrapeResultAndClient),
|
|
chanStart: make(chan start),
|
|
chanStop: make(chan vo.Status),
|
|
chanStatus: make(chan vo.Status),
|
|
chanStarted: make(chan started),
|
|
}
|
|
go w.scrapeloop()
|
|
return w
|
|
}
|
|
|
|
func (w *Walker) Walk(
|
|
conf *config.Config,
|
|
linkListFilter LinkListFilterFunc,
|
|
scrapeFunc ScrapeFunc,
|
|
validationFunc ValidationFunc,
|
|
scrapeResultModifierFunc ScrapeResultModifierFunc,
|
|
) (chanLoopStatus chan vo.Status, err error) {
|
|
var groupValidator *htmlschema.GroupValidator
|
|
if conf.SchemaRoot != "" {
|
|
gv, errGroupValidator := htmlschema.NewGroupValidator(conf.SchemaRoot)
|
|
if errGroupValidator != nil {
|
|
return nil, errGroupValidator
|
|
}
|
|
groupValidator = gv
|
|
}
|
|
w.chanStart <- start{
|
|
groupValidator: groupValidator,
|
|
conf: *conf,
|
|
scrapeFunc: scrapeFunc,
|
|
linkListFilterFunc: linkListFilter,
|
|
validationFunc: validationFunc,
|
|
scrapeResultModifierFunc: scrapeResultModifierFunc,
|
|
}
|
|
st := <-w.chanStarted
|
|
return st.ChanLoopComplete, st.Err
|
|
}
|
|
|
|
func (w *Walker) Stop() vo.Status {
|
|
w.chanStop <- vo.Status{}
|
|
return <-w.chanStop
|
|
}
|
|
|
|
func (w *Walker) GetStatus() vo.Status {
|
|
w.chanStatus <- vo.Status{}
|
|
return <-w.chanStatus
|
|
}
|
|
|
|
// line writes a horizontal rule made of dashes, followed by a newline, to w.
// A write error is not reported.
func line(w io.Writer) {
	const rule = "------------------------------------------------------------------------"
	fmt.Fprintln(w, rule)
}
|
|
|
|
func headline(w io.Writer, v ...interface{}) {
|
|
fmt.Fprintln(w)
|
|
fmt.Fprintln(w, v...)
|
|
line(w)
|
|
}
|
|
|
|
func (wlkr *Walker) GetReportHandler(basePath string) http.HandlerFunc {
|
|
h := reports.GetReportHandler(basePath)
|
|
return func(w http.ResponseWriter, r *http.Request) {
|
|
runningStatus := wlkr.GetStatus()
|
|
h(w, r, wlkr.CompleteStatus, &runningStatus)
|
|
}
|
|
}
|