added parents/children to filter model, fixed scraping (+tests)

This commit is contained in:
BroodjeAap 2022-09-25 15:52:53 +00:00
parent 499bb09125
commit c701063fb4
3 changed files with 147 additions and 133 deletions

View file

@ -7,16 +7,18 @@ type Watch struct {
} }
type Filter struct { type Filter struct {
ID uint `form:"filter_id" yaml:"filter_id" json:"filter_id"` ID uint `form:"filter_id" yaml:"filter_id" json:"filter_id"`
WatchID uint `form:"filter_watch_id" yaml:"filter_watch_id" json:"filter_watch_id" binding:"required"` WatchID uint `form:"filter_watch_id" yaml:"filter_watch_id" json:"filter_watch_id" binding:"required"`
Name string `form:"filter_name" yaml:"filter_name" json:"filter_name" binding:"required" validate:"min=1"` Name string `form:"filter_name" yaml:"filter_name" json:"filter_name" binding:"required" validate:"min=1"`
X int `form:"x" yaml:"x" json:"x" validate:"default=0"` X int `form:"x" yaml:"x" json:"x" validate:"default=0"`
Y int `form:"y" yaml:"y" json:"y" validate:"default=0"` Y int `form:"y" yaml:"y" json:"y" validate:"default=0"`
Type string `form:"filter_type" yaml:"filter_type" json:"filter_type" binding:"required" validate:"oneof=url xpath json css replace match substring"` Type string `form:"filter_type" yaml:"filter_type" json:"filter_type" binding:"required" validate:"oneof=url xpath json css replace match substring"`
Var1 string `form:"var1" yaml:"var1" json:"var1" binding:"required"` Var1 string `form:"var1" yaml:"var1" json:"var1" binding:"required"`
Var2 string `form:"var2" yaml:"var2" json:"var2"` Var2 *string `form:"var2" yaml:"var2" json:"var2"`
Var3 string `form:"var3" yaml:"var3" json:"var3"` Var3 *string `form:"var3" yaml:"var3" json:"var3"`
Results []string `gorm:"-:all"` Parents []*Filter `gorm:"-:all"`
Children []*Filter `gorm:"-:all"`
Results []string `gorm:"-:all"`
} }
type FilterConnection struct { type FilterConnection struct {

View file

@ -1,12 +1,17 @@
package main package main
/* import (
func getFilterResults(filter *Filter) { "bytes"
getFilterResult(filter) "log"
for _, filter := range filter.Filters { "regexp"
getFilterResults(&filter) "strconv"
} "strings"
}
"github.com/andybalholm/cascadia"
"github.com/antchfx/htmlquery"
"github.com/tidwall/gjson"
"golang.org/x/net/html"
)
func getFilterResult(filter *Filter) { func getFilterResult(filter *Filter) {
switch { switch {
@ -40,83 +45,91 @@ func getFilterResult(filter *Filter) {
} }
func getFilterResultXPath(filter *Filter) { func getFilterResultXPath(filter *Filter) {
if filter.Parent == nil { if filter.Parents == nil {
log.Println("Filter", filter.Name, "called without parent for", filter.Type) log.Println("Filter", filter.Name, "called without parents for", filter.Type)
return return
} }
for _, result := range filter.Parent.Results { for _, parent := range filter.Parents {
doc, err := htmlquery.Parse(strings.NewReader(result)) for _, result := range parent.Results {
if err != nil { doc, err := htmlquery.Parse(strings.NewReader(result))
log.Print(err) if err != nil {
continue log.Print(err)
} continue
nodes, _ := htmlquery.QueryAll(doc, filter.Var1) }
for _, node := range nodes { nodes, _ := htmlquery.QueryAll(doc, filter.Var1)
var b bytes.Buffer for _, node := range nodes {
html.Render(&b, node) var b bytes.Buffer
filter.Results = append(filter.Results, html.UnescapeString(b.String())) html.Render(&b, node)
filter.Results = append(filter.Results, html.UnescapeString(b.String()))
}
} }
} }
} }
func getFilterResultJSON(filter *Filter) { func getFilterResultJSON(filter *Filter) {
if filter.Parent == nil { if filter.Parents == nil {
log.Println("Filter", filter.Name, "called without parent for", filter.Type) log.Println("Filter", filter.Name, "called without parent for", filter.Type)
return return
} }
for _, result := range filter.Parent.Results { for _, parent := range filter.Parents {
for _, match := range gjson.Get(result, filter.Var1).Array() { for _, result := range parent.Results {
filter.Results = append(filter.Results, match.String()) for _, match := range gjson.Get(result, filter.Var1).Array() {
filter.Results = append(filter.Results, match.String())
}
} }
} }
} }
func getFilterResultCSS(filter *Filter) { func getFilterResultCSS(filter *Filter) {
if filter.Parent == nil { if filter.Parents == nil {
log.Println("Filter", filter.Name, "called without parent for", filter.Type) log.Println("Filter", filter.Name, "called without parent for", filter.Type)
return return
} }
for _, result := range filter.Parent.Results { for _, parent := range filter.Parents {
doc, err := html.Parse(strings.NewReader(result)) for _, result := range parent.Results {
if err != nil { doc, err := html.Parse(strings.NewReader(result))
log.Print(err) if err != nil {
continue log.Print(err)
} continue
sel, err := cascadia.Parse(filter.Var1) }
if err != nil { sel, err := cascadia.Parse(filter.Var1)
log.Print(err) if err != nil {
continue log.Print(err)
} continue
for _, node := range cascadia.QueryAll(doc, sel) { }
var b bytes.Buffer for _, node := range cascadia.QueryAll(doc, sel) {
html.Render(&b, node) var b bytes.Buffer
log.Println(b.String()) html.Render(&b, node)
filter.Results = append(filter.Results, html.UnescapeString(b.String())) log.Println(b.String())
filter.Results = append(filter.Results, html.UnescapeString(b.String()))
}
} }
} }
} }
func getFilterResultReplace(filter *Filter) { func getFilterResultReplace(filter *Filter) {
if filter.Parent == nil { if filter.Parents == nil {
log.Println("Filter", filter.Name, "called without parent for", filter.Type) log.Println("Filter", filter.Name, "called without parent for", filter.Type)
return return
} }
for _, result := range filter.Parent.Results { for _, parent := range filter.Parents {
r, err := regexp.Compile(filter.Var1) for _, result := range parent.Results {
if err != nil { r, err := regexp.Compile(filter.Var1)
log.Print(err) if err != nil {
continue log.Print(err)
} continue
if filter.Var2 == nil { }
filter.Results = append(filter.Results, r.ReplaceAllString(result, "")) if filter.Var2 == nil {
} else { filter.Results = append(filter.Results, r.ReplaceAllString(result, ""))
filter.Results = append(filter.Results, r.ReplaceAllString(result, *filter.Var2)) } else {
filter.Results = append(filter.Results, r.ReplaceAllString(result, *filter.Var2))
}
} }
} }
} }
func getFilterResultMatch(filter *Filter) { func getFilterResultMatch(filter *Filter) {
if filter.Parent == nil { if filter.Parents == nil {
log.Println("Filter", filter.Name, "called without parent for", filter.Type) log.Println("Filter", filter.Name, "called without parent for", filter.Type)
return return
} }
@ -125,75 +138,74 @@ func getFilterResultMatch(filter *Filter) {
log.Print(err) log.Print(err)
return return
} }
for _, result := range filter.Parent.Results { for _, parent := range filter.Parents {
log.Println(">", result) for _, result := range parent.Results {
for _, str := range r.FindAllString(result, -1) { log.Println(">", result)
log.Println(">>", str) for _, str := range r.FindAllString(result, -1) {
filter.Results = append(filter.Results, str) log.Println(">>", str)
filter.Results = append(filter.Results, str)
}
} }
} }
} }
func getFilterResultSubstring(filter *Filter) { func getFilterResultSubstring(filter *Filter) {
if filter.Parent == nil { if filter.Parents == nil {
log.Println("Filter", filter.Name, "called without parent for", filter.Type) log.Println("Filter", filter.Name, "called without parent for", filter.Type)
return return
} }
for _, result := range filter.Parent.Results { for _, parent := range filter.Parents {
substrings := strings.Split(filter.Var1, ",") for _, result := range parent.Results {
var sb strings.Builder substrings := strings.Split(filter.Var1, ",")
asRunes := []rune(result) var sb strings.Builder
asRunes := []rune(result)
for _, substring := range substrings { for _, substring := range substrings {
if strings.Contains(substring, ":") { if strings.Contains(substring, ":") {
from_to := strings.Split(substring, ":") from_to := strings.Split(substring, ":")
if len(from_to) != 2 { if len(from_to) != 2 {
filter.Results = filter.Parent.Results return
return }
fromStr := from_to[0]
var hasFrom bool = true
if fromStr == "" {
hasFrom = false
}
from64, err := strconv.ParseInt(fromStr, 10, 32)
var from = int(from64)
if hasFrom && err != nil {
return
} else if from < 0 {
from = len(asRunes) + from
}
toStr := from_to[1]
var hasTo bool = true
if toStr == "" {
hasTo = false
}
to64, err := strconv.ParseInt(toStr, 10, 32)
var to = int(to64)
if hasTo && err != nil {
return
} else if to < 0 {
to = len(asRunes) + to
}
if hasFrom && hasTo {
sb.WriteString(string(asRunes[from:to]))
} else if hasFrom {
sb.WriteString(string(asRunes[from:]))
} else if hasTo {
sb.WriteString(string(asRunes[:to]))
}
} else {
pos, err := strconv.ParseInt(substring, 10, 32)
if err != nil || pos < 0 {
return
}
sb.WriteRune(asRunes[pos])
} }
fromStr := from_to[0]
var hasFrom bool = true
if fromStr == "" {
hasFrom = false
}
from64, err := strconv.ParseInt(fromStr, 10, 32)
var from = int(from64)
if hasFrom && err != nil {
filter.Results = filter.Parent.Results
return
} else if from < 0 {
from = len(asRunes) + from
}
toStr := from_to[1]
var hasTo bool = true
if toStr == "" {
hasTo = false
}
to64, err := strconv.ParseInt(toStr, 10, 32)
var to = int(to64)
if hasTo && err != nil {
filter.Results = filter.Parent.Results
return
} else if to < 0 {
to = len(asRunes) + to
}
if hasFrom && hasTo {
sb.WriteString(string(asRunes[from:to]))
} else if hasFrom {
sb.WriteString(string(asRunes[from:]))
} else if hasTo {
sb.WriteString(string(asRunes[:to]))
}
} else {
pos, err := strconv.ParseInt(substring, 10, 32)
if err != nil || pos < 0 {
filter.Results = filter.Parent.Results
return
}
sb.WriteRune(asRunes[pos])
} }
filter.Results = append(filter.Results, sb.String())
} }
filter.Results = append(filter.Results, sb.String())
} }
} }
*/

View file

@ -56,8 +56,8 @@ func TestFilterXPath(t *testing.T) {
testname := fmt.Sprintf("%s", test.Query) testname := fmt.Sprintf("%s", test.Query)
t.Run(testname, func(t *testing.T) { t.Run(testname, func(t *testing.T) {
filter := Filter{ filter := Filter{
Parent: &Filter{ Parents: []*Filter{
Results: []string{HTML_STRING}, {Results: []string{HTML_STRING}},
}, },
Var1: test.Query, Var1: test.Query,
} }
@ -86,8 +86,8 @@ func TestFilterJSON(t *testing.T) {
testname := fmt.Sprintf("%s", test.Query) testname := fmt.Sprintf("%s", test.Query)
t.Run(testname, func(t *testing.T) { t.Run(testname, func(t *testing.T) {
filter := Filter{ filter := Filter{
Parent: &Filter{ Parents: []*Filter{
Results: []string{JSON_STRING}, {Results: []string{JSON_STRING}},
}, },
Var1: test.Query, Var1: test.Query,
} }
@ -117,8 +117,8 @@ func TestFilterCSS(t *testing.T) {
testname := fmt.Sprintf("%s", test.Query) testname := fmt.Sprintf("%s", test.Query)
t.Run(testname, func(t *testing.T) { t.Run(testname, func(t *testing.T) {
filter := Filter{ filter := Filter{
Parent: &Filter{ Parents: []*Filter{
Results: []string{HTML_STRING}, {Results: []string{HTML_STRING}},
}, },
Var1: test.Query, Var1: test.Query,
} }
@ -154,8 +154,8 @@ func TestFilterReplace(t *testing.T) {
testname := fmt.Sprintf("%s %s", test.Input, test.Query) testname := fmt.Sprintf("%s %s", test.Input, test.Query)
t.Run(testname, func(t *testing.T) { t.Run(testname, func(t *testing.T) {
filter := Filter{ filter := Filter{
Parent: &Filter{ Parents: []*Filter{
Results: []string{test.Input}, {Results: []string{test.Input}},
}, },
Var1: test.Query, Var1: test.Query,
} }
@ -186,8 +186,8 @@ func TestFilterMatch(t *testing.T) {
testname := fmt.Sprintf("%s", test.Query) testname := fmt.Sprintf("%s", test.Query)
t.Run(testname, func(t *testing.T) { t.Run(testname, func(t *testing.T) {
filter := Filter{ filter := Filter{
Parent: &Filter{ Parents: []*Filter{
Results: []string{test.Input}, {Results: []string{test.Input}},
}, },
Var1: test.Query, Var1: test.Query,
} }
@ -237,8 +237,8 @@ func TestFilterSubstring(t *testing.T) {
testname := fmt.Sprintf("%s %s", test.Input, test.Query) testname := fmt.Sprintf("%s %s", test.Input, test.Query)
t.Run(testname, func(t *testing.T) { t.Run(testname, func(t *testing.T) {
filter := Filter{ filter := Filter{
Parent: &Filter{ Parents: []*Filter{
Results: []string{test.Input}, {Results: []string{test.Input}},
}, },
Var1: test.Query, Var1: test.Query,
} }