From c701063fb47a0f955af3668279860800e19b88d3 Mon Sep 17 00:00:00 2001 From: BroodjeAap Date: Sun, 25 Sep 2022 15:52:53 +0000 Subject: [PATCH] added parents/children to filter model, fixed scraping (+tests) --- models.go | 22 +++-- scraping.go | 234 +++++++++++++++++++++++++---------------------- scraping_test.go | 24 ++--- 3 files changed, 147 insertions(+), 133 deletions(-) diff --git a/models.go b/models.go index 5fa378c..6903194 100644 --- a/models.go +++ b/models.go @@ -7,16 +7,18 @@ type Watch struct { } type Filter struct { - ID uint `form:"filter_id" yaml:"filter_id" json:"filter_id"` - WatchID uint `form:"filter_watch_id" yaml:"filter_watch_id" json:"filter_watch_id" binding:"required"` - Name string `form:"filter_name" yaml:"filter_name" json:"filter_name" binding:"required" validate:"min=1"` - X int `form:"x" yaml:"x" json:"x" validate:"default=0"` - Y int `form:"y" yaml:"y" json:"y" validate:"default=0"` - Type string `form:"filter_type" yaml:"filter_type" json:"filter_type" binding:"required" validate:"oneof=url xpath json css replace match substring"` - Var1 string `form:"var1" yaml:"var1" json:"var1" binding:"required"` - Var2 string `form:"var2" yaml:"var2" json:"var2"` - Var3 string `form:"var3" yaml:"var3" json:"var3"` - Results []string `gorm:"-:all"` + ID uint `form:"filter_id" yaml:"filter_id" json:"filter_id"` + WatchID uint `form:"filter_watch_id" yaml:"filter_watch_id" json:"filter_watch_id" binding:"required"` + Name string `form:"filter_name" yaml:"filter_name" json:"filter_name" binding:"required" validate:"min=1"` + X int `form:"x" yaml:"x" json:"x" validate:"default=0"` + Y int `form:"y" yaml:"y" json:"y" validate:"default=0"` + Type string `form:"filter_type" yaml:"filter_type" json:"filter_type" binding:"required" validate:"oneof=url xpath json css replace match substring"` + Var1 string `form:"var1" yaml:"var1" json:"var1" binding:"required"` + Var2 *string `form:"var2" yaml:"var2" json:"var2"` + Var3 *string `form:"var3" yaml:"var3" json:"var3"` + Parents []*Filter `gorm:"-:all"` + Children []*Filter `gorm:"-:all"` + Results []string `gorm:"-:all"` } type FilterConnection struct { diff --git a/scraping.go b/scraping.go index 0193295..33d373f 100644 --- a/scraping.go +++ b/scraping.go @@ -1,12 +1,17 @@ package main -/* -func getFilterResults(filter *Filter) { - getFilterResult(filter) - for _, filter := range filter.Filters { - getFilterResults(&filter) - } -} +import ( + "bytes" + "log" + "regexp" + "strconv" + "strings" + + "github.com/andybalholm/cascadia" + "github.com/antchfx/htmlquery" + "github.com/tidwall/gjson" + "golang.org/x/net/html" +) func getFilterResult(filter *Filter) { switch { @@ -40,83 +45,91 @@ func getFilterResult(filter *Filter) { } func getFilterResultXPath(filter *Filter) { - if filter.Parent == nil { - log.Println("Filter", filter.Name, "called without parent for", filter.Type) + if filter.Parents == nil { + log.Println("Filter", filter.Name, "called without parents for", filter.Type) return } - for _, result := range filter.Parent.Results { - doc, err := htmlquery.Parse(strings.NewReader(result)) - if err != nil { - log.Print(err) - continue - } - nodes, _ := htmlquery.QueryAll(doc, filter.Var1) - for _, node := range nodes { - var b bytes.Buffer - html.Render(&b, node) - filter.Results = append(filter.Results, html.UnescapeString(b.String())) + for _, parent := range filter.Parents { + for _, result := range parent.Results { + doc, err := htmlquery.Parse(strings.NewReader(result)) + if err != nil { + log.Print(err) + continue + } + nodes, _ := htmlquery.QueryAll(doc, filter.Var1) + for _, node := range nodes { + var b bytes.Buffer + html.Render(&b, node) + filter.Results = append(filter.Results, html.UnescapeString(b.String())) + } } } } func getFilterResultJSON(filter *Filter) { - if filter.Parent == nil { + if filter.Parents == nil { log.Println("Filter", filter.Name, "called without parent for", filter.Type) return } - for _, result := range filter.Parent.Results { - for _, match := range gjson.Get(result, filter.Var1).Array() { - filter.Results = append(filter.Results, match.String()) + for _, parent := range filter.Parents { + for _, result := range parent.Results { + for _, match := range gjson.Get(result, filter.Var1).Array() { + filter.Results = append(filter.Results, match.String()) + } } } } func getFilterResultCSS(filter *Filter) { - if filter.Parent == nil { + if filter.Parents == nil { log.Println("Filter", filter.Name, "called without parent for", filter.Type) return } - for _, result := range filter.Parent.Results { - doc, err := html.Parse(strings.NewReader(result)) - if err != nil { - log.Print(err) - continue - } - sel, err := cascadia.Parse(filter.Var1) - if err != nil { - log.Print(err) - continue - } - for _, node := range cascadia.QueryAll(doc, sel) { - var b bytes.Buffer - html.Render(&b, node) - log.Println(b.String()) - filter.Results = append(filter.Results, html.UnescapeString(b.String())) + for _, parent := range filter.Parents { + for _, result := range parent.Results { + doc, err := html.Parse(strings.NewReader(result)) + if err != nil { + log.Print(err) + continue + } + sel, err := cascadia.Parse(filter.Var1) + if err != nil { + log.Print(err) + continue + } + for _, node := range cascadia.QueryAll(doc, sel) { + var b bytes.Buffer + html.Render(&b, node) + log.Println(b.String()) + filter.Results = append(filter.Results, html.UnescapeString(b.String())) + } } } } func getFilterResultReplace(filter *Filter) { - if filter.Parent == nil { + if filter.Parents == nil { log.Println("Filter", filter.Name, "called without parent for", filter.Type) return } - for _, result := range filter.Parent.Results { - r, err := regexp.Compile(filter.Var1) - if err != nil { - log.Print(err) - continue - } - if filter.Var2 == nil { - filter.Results = append(filter.Results, r.ReplaceAllString(result, "")) - } else { - filter.Results = append(filter.Results, r.ReplaceAllString(result, *filter.Var2)) + for _, parent := range filter.Parents { + for _, result := range parent.Results { + r, err := regexp.Compile(filter.Var1) + if err != nil { + log.Print(err) + continue + } + if filter.Var2 == nil { + filter.Results = append(filter.Results, r.ReplaceAllString(result, "")) + } else { + filter.Results = append(filter.Results, r.ReplaceAllString(result, *filter.Var2)) + } } } } func getFilterResultMatch(filter *Filter) { - if filter.Parent == nil { + if filter.Parents == nil { log.Println("Filter", filter.Name, "called without parent for", filter.Type) return } @@ -125,75 +138,74 @@ func getFilterResultMatch(filter *Filter) { log.Print(err) return } - for _, result := range filter.Parent.Results { - log.Println(">", result) - for _, str := range r.FindAllString(result, -1) { - log.Println(">>", str) - filter.Results = append(filter.Results, str) + for _, parent := range filter.Parents { + for _, result := range parent.Results { + log.Println(">", result) + for _, str := range r.FindAllString(result, -1) { + log.Println(">>", str) + filter.Results = append(filter.Results, str) + } } } } func getFilterResultSubstring(filter *Filter) { - if filter.Parent == nil { + if filter.Parents == nil { log.Println("Filter", filter.Name, "called without parent for", filter.Type) return } - for _, result := range filter.Parent.Results { - substrings := strings.Split(filter.Var1, ",") - var sb strings.Builder - asRunes := []rune(result) + for _, parent := range filter.Parents { + for _, result := range parent.Results { + substrings := strings.Split(filter.Var1, ",") + var sb strings.Builder + asRunes := []rune(result) - for _, substring := range substrings { - if strings.Contains(substring, ":") { - from_to := strings.Split(substring, ":") - if len(from_to) != 2 { - filter.Results = filter.Parent.Results - return + for _, substring := range substrings { + if strings.Contains(substring, ":") { + from_to := strings.Split(substring, ":") + if len(from_to) != 2 { + return + } + fromStr := from_to[0] + var hasFrom bool = true + if fromStr == "" { + hasFrom = false + } + from64, err := strconv.ParseInt(fromStr, 10, 32) + var from = int(from64) + if hasFrom && err != nil { + return + } else if from < 0 { + from = len(asRunes) + from + } + toStr := from_to[1] + var hasTo bool = true + if toStr == "" { + hasTo = false + } + to64, err := strconv.ParseInt(toStr, 10, 32) + var to = int(to64) + if hasTo && err != nil { + return + } else if to < 0 { + to = len(asRunes) + to + } + if hasFrom && hasTo { + sb.WriteString(string(asRunes[from:to])) + } else if hasFrom { + sb.WriteString(string(asRunes[from:])) + } else if hasTo { + sb.WriteString(string(asRunes[:to])) + } + } else { + pos, err := strconv.ParseInt(substring, 10, 32) + if err != nil || pos < 0 { + return + } + sb.WriteRune(asRunes[pos]) } - fromStr := from_to[0] - var hasFrom bool = true - if fromStr == "" { - hasFrom = false - } - from64, err := strconv.ParseInt(fromStr, 10, 32) - var from = int(from64) - if hasFrom && err != nil { - filter.Results = filter.Parent.Results - return - } else if from < 0 { - from = len(asRunes) + from - } - toStr := from_to[1] - var hasTo bool = true - if toStr == "" { - hasTo = false - } - to64, err := strconv.ParseInt(toStr, 10, 32) - var to = int(to64) - if hasTo && err != nil { - filter.Results = filter.Parent.Results - return - } else if to < 0 { - to = len(asRunes) + to - } - if hasFrom && hasTo { - sb.WriteString(string(asRunes[from:to])) - } else if hasFrom { - sb.WriteString(string(asRunes[from:])) - } else if hasTo { - sb.WriteString(string(asRunes[:to])) - } - } else { - pos, err := strconv.ParseInt(substring, 10, 32) - if err != nil || pos < 0 { - filter.Results = filter.Parent.Results - return - } - sb.WriteRune(asRunes[pos]) } + filter.Results = append(filter.Results, sb.String()) } - filter.Results = append(filter.Results, sb.String()) } } -*/ diff --git a/scraping_test.go b/scraping_test.go index dbb8096..bcf4f4f 100644 --- a/scraping_test.go +++ b/scraping_test.go @@ -56,8 +56,8 @@ func TestFilterXPath(t *testing.T) { testname := fmt.Sprintf("%s", test.Query) t.Run(testname, func(t *testing.T) { filter := Filter{ - Parent: &Filter{ - Results: []string{HTML_STRING}, + Parents: []*Filter{ + {Results: []string{HTML_STRING}}, }, Var1: test.Query, } @@ -86,8 +86,8 @@ func TestFilterJSON(t *testing.T) { testname := fmt.Sprintf("%s", test.Query) t.Run(testname, func(t *testing.T) { filter := Filter{ - Parent: &Filter{ - Results: []string{JSON_STRING}, + Parents: []*Filter{ + {Results: []string{JSON_STRING}}, }, Var1: test.Query, } @@ -117,8 +117,8 @@ func TestFilterCSS(t *testing.T) { testname := fmt.Sprintf("%s", test.Query) t.Run(testname, func(t *testing.T) { filter := Filter{ - Parent: &Filter{ - Results: []string{HTML_STRING}, + Parents: []*Filter{ + {Results: []string{HTML_STRING}}, }, Var1: test.Query, } @@ -154,8 +154,8 @@ func TestFilterReplace(t *testing.T) { testname := fmt.Sprintf("%s %s", test.Input, test.Query) t.Run(testname, func(t *testing.T) { filter := Filter{ - Parent: &Filter{ - Results: []string{test.Input}, + Parents: []*Filter{ + {Results: []string{test.Input}}, }, Var1: test.Query, } @@ -186,8 +186,8 @@ func TestFilterMatch(t *testing.T) { testname := fmt.Sprintf("%s", test.Query) t.Run(testname, func(t *testing.T) { filter := Filter{ - Parent: &Filter{ - Results: []string{test.Input}, + Parents: []*Filter{ + {Results: []string{test.Input}}, }, Var1: test.Query, } @@ -237,8 +237,8 @@ func TestFilterSubstring(t *testing.T) { testname := fmt.Sprintf("%s %s", test.Input, test.Query) t.Run(testname, func(t *testing.T) { filter := Filter{ - Parent: &Filter{ - Results: []string{test.Input}, + Parents: []*Filter{ + {Results: []string{test.Input}}, }, Var1: test.Query, }