added parents/children to filter model, fixed scraping (+tests)
This commit is contained in:
parent
499bb09125
commit
c701063fb4
3 changed files with 147 additions and 133 deletions
22
models.go
22
models.go
|
@ -7,16 +7,18 @@ type Watch struct {
|
||||||
}
|
}
|
||||||
|
|
||||||
type Filter struct {
|
type Filter struct {
|
||||||
ID uint `form:"filter_id" yaml:"filter_id" json:"filter_id"`
|
ID uint `form:"filter_id" yaml:"filter_id" json:"filter_id"`
|
||||||
WatchID uint `form:"filter_watch_id" yaml:"filter_watch_id" json:"filter_watch_id" binding:"required"`
|
WatchID uint `form:"filter_watch_id" yaml:"filter_watch_id" json:"filter_watch_id" binding:"required"`
|
||||||
Name string `form:"filter_name" yaml:"filter_name" json:"filter_name" binding:"required" validate:"min=1"`
|
Name string `form:"filter_name" yaml:"filter_name" json:"filter_name" binding:"required" validate:"min=1"`
|
||||||
X int `form:"x" yaml:"x" json:"x" validate:"default=0"`
|
X int `form:"x" yaml:"x" json:"x" validate:"default=0"`
|
||||||
Y int `form:"y" yaml:"y" json:"y" validate:"default=0"`
|
Y int `form:"y" yaml:"y" json:"y" validate:"default=0"`
|
||||||
Type string `form:"filter_type" yaml:"filter_type" json:"filter_type" binding:"required" validate:"oneof=url xpath json css replace match substring"`
|
Type string `form:"filter_type" yaml:"filter_type" json:"filter_type" binding:"required" validate:"oneof=url xpath json css replace match substring"`
|
||||||
Var1 string `form:"var1" yaml:"var1" json:"var1" binding:"required"`
|
Var1 string `form:"var1" yaml:"var1" json:"var1" binding:"required"`
|
||||||
Var2 string `form:"var2" yaml:"var2" json:"var2"`
|
Var2 *string `form:"var2" yaml:"var2" json:"var2"`
|
||||||
Var3 string `form:"var3" yaml:"var3" json:"var3"`
|
Var3 *string `form:"var3" yaml:"var3" json:"var3"`
|
||||||
Results []string `gorm:"-:all"`
|
Parents []*Filter `gorm:"-:all"`
|
||||||
|
Children []*Filter `gorm:"-:all"`
|
||||||
|
Results []string `gorm:"-:all"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type FilterConnection struct {
|
type FilterConnection struct {
|
||||||
|
|
234
scraping.go
234
scraping.go
|
@ -1,12 +1,17 @@
|
||||||
package main
|
package main
|
||||||
|
|
||||||
/*
|
import (
|
||||||
func getFilterResults(filter *Filter) {
|
"bytes"
|
||||||
getFilterResult(filter)
|
"log"
|
||||||
for _, filter := range filter.Filters {
|
"regexp"
|
||||||
getFilterResults(&filter)
|
"strconv"
|
||||||
}
|
"strings"
|
||||||
}
|
|
||||||
|
"github.com/andybalholm/cascadia"
|
||||||
|
"github.com/antchfx/htmlquery"
|
||||||
|
"github.com/tidwall/gjson"
|
||||||
|
"golang.org/x/net/html"
|
||||||
|
)
|
||||||
|
|
||||||
func getFilterResult(filter *Filter) {
|
func getFilterResult(filter *Filter) {
|
||||||
switch {
|
switch {
|
||||||
|
@ -40,83 +45,91 @@ func getFilterResult(filter *Filter) {
|
||||||
}
|
}
|
||||||
|
|
||||||
func getFilterResultXPath(filter *Filter) {
|
func getFilterResultXPath(filter *Filter) {
|
||||||
if filter.Parent == nil {
|
if filter.Parents == nil {
|
||||||
log.Println("Filter", filter.Name, "called without parent for", filter.Type)
|
log.Println("Filter", filter.Name, "called without parents for", filter.Type)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
for _, result := range filter.Parent.Results {
|
for _, parent := range filter.Parents {
|
||||||
doc, err := htmlquery.Parse(strings.NewReader(result))
|
for _, result := range parent.Results {
|
||||||
if err != nil {
|
doc, err := htmlquery.Parse(strings.NewReader(result))
|
||||||
log.Print(err)
|
if err != nil {
|
||||||
continue
|
log.Print(err)
|
||||||
}
|
continue
|
||||||
nodes, _ := htmlquery.QueryAll(doc, filter.Var1)
|
}
|
||||||
for _, node := range nodes {
|
nodes, _ := htmlquery.QueryAll(doc, filter.Var1)
|
||||||
var b bytes.Buffer
|
for _, node := range nodes {
|
||||||
html.Render(&b, node)
|
var b bytes.Buffer
|
||||||
filter.Results = append(filter.Results, html.UnescapeString(b.String()))
|
html.Render(&b, node)
|
||||||
|
filter.Results = append(filter.Results, html.UnescapeString(b.String()))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func getFilterResultJSON(filter *Filter) {
|
func getFilterResultJSON(filter *Filter) {
|
||||||
if filter.Parent == nil {
|
if filter.Parents == nil {
|
||||||
log.Println("Filter", filter.Name, "called without parent for", filter.Type)
|
log.Println("Filter", filter.Name, "called without parent for", filter.Type)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
for _, result := range filter.Parent.Results {
|
for _, parent := range filter.Parents {
|
||||||
for _, match := range gjson.Get(result, filter.Var1).Array() {
|
for _, result := range parent.Results {
|
||||||
filter.Results = append(filter.Results, match.String())
|
for _, match := range gjson.Get(result, filter.Var1).Array() {
|
||||||
|
filter.Results = append(filter.Results, match.String())
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func getFilterResultCSS(filter *Filter) {
|
func getFilterResultCSS(filter *Filter) {
|
||||||
if filter.Parent == nil {
|
if filter.Parents == nil {
|
||||||
log.Println("Filter", filter.Name, "called without parent for", filter.Type)
|
log.Println("Filter", filter.Name, "called without parent for", filter.Type)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
for _, result := range filter.Parent.Results {
|
for _, parent := range filter.Parents {
|
||||||
doc, err := html.Parse(strings.NewReader(result))
|
for _, result := range parent.Results {
|
||||||
if err != nil {
|
doc, err := html.Parse(strings.NewReader(result))
|
||||||
log.Print(err)
|
if err != nil {
|
||||||
continue
|
log.Print(err)
|
||||||
}
|
continue
|
||||||
sel, err := cascadia.Parse(filter.Var1)
|
}
|
||||||
if err != nil {
|
sel, err := cascadia.Parse(filter.Var1)
|
||||||
log.Print(err)
|
if err != nil {
|
||||||
continue
|
log.Print(err)
|
||||||
}
|
continue
|
||||||
for _, node := range cascadia.QueryAll(doc, sel) {
|
}
|
||||||
var b bytes.Buffer
|
for _, node := range cascadia.QueryAll(doc, sel) {
|
||||||
html.Render(&b, node)
|
var b bytes.Buffer
|
||||||
log.Println(b.String())
|
html.Render(&b, node)
|
||||||
filter.Results = append(filter.Results, html.UnescapeString(b.String()))
|
log.Println(b.String())
|
||||||
|
filter.Results = append(filter.Results, html.UnescapeString(b.String()))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func getFilterResultReplace(filter *Filter) {
|
func getFilterResultReplace(filter *Filter) {
|
||||||
if filter.Parent == nil {
|
if filter.Parents == nil {
|
||||||
log.Println("Filter", filter.Name, "called without parent for", filter.Type)
|
log.Println("Filter", filter.Name, "called without parent for", filter.Type)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
for _, result := range filter.Parent.Results {
|
for _, parent := range filter.Parents {
|
||||||
r, err := regexp.Compile(filter.Var1)
|
for _, result := range parent.Results {
|
||||||
if err != nil {
|
r, err := regexp.Compile(filter.Var1)
|
||||||
log.Print(err)
|
if err != nil {
|
||||||
continue
|
log.Print(err)
|
||||||
}
|
continue
|
||||||
if filter.Var2 == nil {
|
}
|
||||||
filter.Results = append(filter.Results, r.ReplaceAllString(result, ""))
|
if filter.Var2 == nil {
|
||||||
} else {
|
filter.Results = append(filter.Results, r.ReplaceAllString(result, ""))
|
||||||
filter.Results = append(filter.Results, r.ReplaceAllString(result, *filter.Var2))
|
} else {
|
||||||
|
filter.Results = append(filter.Results, r.ReplaceAllString(result, *filter.Var2))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func getFilterResultMatch(filter *Filter) {
|
func getFilterResultMatch(filter *Filter) {
|
||||||
if filter.Parent == nil {
|
if filter.Parents == nil {
|
||||||
log.Println("Filter", filter.Name, "called without parent for", filter.Type)
|
log.Println("Filter", filter.Name, "called without parent for", filter.Type)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
@ -125,75 +138,74 @@ func getFilterResultMatch(filter *Filter) {
|
||||||
log.Print(err)
|
log.Print(err)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
for _, result := range filter.Parent.Results {
|
for _, parent := range filter.Parents {
|
||||||
log.Println(">", result)
|
for _, result := range parent.Results {
|
||||||
for _, str := range r.FindAllString(result, -1) {
|
log.Println(">", result)
|
||||||
log.Println(">>", str)
|
for _, str := range r.FindAllString(result, -1) {
|
||||||
filter.Results = append(filter.Results, str)
|
log.Println(">>", str)
|
||||||
|
filter.Results = append(filter.Results, str)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func getFilterResultSubstring(filter *Filter) {
|
func getFilterResultSubstring(filter *Filter) {
|
||||||
if filter.Parent == nil {
|
if filter.Parents == nil {
|
||||||
log.Println("Filter", filter.Name, "called without parent for", filter.Type)
|
log.Println("Filter", filter.Name, "called without parent for", filter.Type)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
for _, result := range filter.Parent.Results {
|
for _, parent := range filter.Parents {
|
||||||
substrings := strings.Split(filter.Var1, ",")
|
for _, result := range parent.Results {
|
||||||
var sb strings.Builder
|
substrings := strings.Split(filter.Var1, ",")
|
||||||
asRunes := []rune(result)
|
var sb strings.Builder
|
||||||
|
asRunes := []rune(result)
|
||||||
|
|
||||||
for _, substring := range substrings {
|
for _, substring := range substrings {
|
||||||
if strings.Contains(substring, ":") {
|
if strings.Contains(substring, ":") {
|
||||||
from_to := strings.Split(substring, ":")
|
from_to := strings.Split(substring, ":")
|
||||||
if len(from_to) != 2 {
|
if len(from_to) != 2 {
|
||||||
filter.Results = filter.Parent.Results
|
return
|
||||||
return
|
}
|
||||||
|
fromStr := from_to[0]
|
||||||
|
var hasFrom bool = true
|
||||||
|
if fromStr == "" {
|
||||||
|
hasFrom = false
|
||||||
|
}
|
||||||
|
from64, err := strconv.ParseInt(fromStr, 10, 32)
|
||||||
|
var from = int(from64)
|
||||||
|
if hasFrom && err != nil {
|
||||||
|
return
|
||||||
|
} else if from < 0 {
|
||||||
|
from = len(asRunes) + from
|
||||||
|
}
|
||||||
|
toStr := from_to[1]
|
||||||
|
var hasTo bool = true
|
||||||
|
if toStr == "" {
|
||||||
|
hasTo = false
|
||||||
|
}
|
||||||
|
to64, err := strconv.ParseInt(toStr, 10, 32)
|
||||||
|
var to = int(to64)
|
||||||
|
if hasTo && err != nil {
|
||||||
|
return
|
||||||
|
} else if to < 0 {
|
||||||
|
to = len(asRunes) + to
|
||||||
|
}
|
||||||
|
if hasFrom && hasTo {
|
||||||
|
sb.WriteString(string(asRunes[from:to]))
|
||||||
|
} else if hasFrom {
|
||||||
|
sb.WriteString(string(asRunes[from:]))
|
||||||
|
} else if hasTo {
|
||||||
|
sb.WriteString(string(asRunes[:to]))
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
pos, err := strconv.ParseInt(substring, 10, 32)
|
||||||
|
if err != nil || pos < 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
sb.WriteRune(asRunes[pos])
|
||||||
}
|
}
|
||||||
fromStr := from_to[0]
|
|
||||||
var hasFrom bool = true
|
|
||||||
if fromStr == "" {
|
|
||||||
hasFrom = false
|
|
||||||
}
|
|
||||||
from64, err := strconv.ParseInt(fromStr, 10, 32)
|
|
||||||
var from = int(from64)
|
|
||||||
if hasFrom && err != nil {
|
|
||||||
filter.Results = filter.Parent.Results
|
|
||||||
return
|
|
||||||
} else if from < 0 {
|
|
||||||
from = len(asRunes) + from
|
|
||||||
}
|
|
||||||
toStr := from_to[1]
|
|
||||||
var hasTo bool = true
|
|
||||||
if toStr == "" {
|
|
||||||
hasTo = false
|
|
||||||
}
|
|
||||||
to64, err := strconv.ParseInt(toStr, 10, 32)
|
|
||||||
var to = int(to64)
|
|
||||||
if hasTo && err != nil {
|
|
||||||
filter.Results = filter.Parent.Results
|
|
||||||
return
|
|
||||||
} else if to < 0 {
|
|
||||||
to = len(asRunes) + to
|
|
||||||
}
|
|
||||||
if hasFrom && hasTo {
|
|
||||||
sb.WriteString(string(asRunes[from:to]))
|
|
||||||
} else if hasFrom {
|
|
||||||
sb.WriteString(string(asRunes[from:]))
|
|
||||||
} else if hasTo {
|
|
||||||
sb.WriteString(string(asRunes[:to]))
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
pos, err := strconv.ParseInt(substring, 10, 32)
|
|
||||||
if err != nil || pos < 0 {
|
|
||||||
filter.Results = filter.Parent.Results
|
|
||||||
return
|
|
||||||
}
|
|
||||||
sb.WriteRune(asRunes[pos])
|
|
||||||
}
|
}
|
||||||
|
filter.Results = append(filter.Results, sb.String())
|
||||||
}
|
}
|
||||||
filter.Results = append(filter.Results, sb.String())
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
*/
|
|
||||||
|
|
|
@ -56,8 +56,8 @@ func TestFilterXPath(t *testing.T) {
|
||||||
testname := fmt.Sprintf("%s", test.Query)
|
testname := fmt.Sprintf("%s", test.Query)
|
||||||
t.Run(testname, func(t *testing.T) {
|
t.Run(testname, func(t *testing.T) {
|
||||||
filter := Filter{
|
filter := Filter{
|
||||||
Parent: &Filter{
|
Parents: []*Filter{
|
||||||
Results: []string{HTML_STRING},
|
{Results: []string{HTML_STRING}},
|
||||||
},
|
},
|
||||||
Var1: test.Query,
|
Var1: test.Query,
|
||||||
}
|
}
|
||||||
|
@ -86,8 +86,8 @@ func TestFilterJSON(t *testing.T) {
|
||||||
testname := fmt.Sprintf("%s", test.Query)
|
testname := fmt.Sprintf("%s", test.Query)
|
||||||
t.Run(testname, func(t *testing.T) {
|
t.Run(testname, func(t *testing.T) {
|
||||||
filter := Filter{
|
filter := Filter{
|
||||||
Parent: &Filter{
|
Parents: []*Filter{
|
||||||
Results: []string{JSON_STRING},
|
{Results: []string{JSON_STRING}},
|
||||||
},
|
},
|
||||||
Var1: test.Query,
|
Var1: test.Query,
|
||||||
}
|
}
|
||||||
|
@ -117,8 +117,8 @@ func TestFilterCSS(t *testing.T) {
|
||||||
testname := fmt.Sprintf("%s", test.Query)
|
testname := fmt.Sprintf("%s", test.Query)
|
||||||
t.Run(testname, func(t *testing.T) {
|
t.Run(testname, func(t *testing.T) {
|
||||||
filter := Filter{
|
filter := Filter{
|
||||||
Parent: &Filter{
|
Parents: []*Filter{
|
||||||
Results: []string{HTML_STRING},
|
{Results: []string{HTML_STRING}},
|
||||||
},
|
},
|
||||||
Var1: test.Query,
|
Var1: test.Query,
|
||||||
}
|
}
|
||||||
|
@ -154,8 +154,8 @@ func TestFilterReplace(t *testing.T) {
|
||||||
testname := fmt.Sprintf("%s %s", test.Input, test.Query)
|
testname := fmt.Sprintf("%s %s", test.Input, test.Query)
|
||||||
t.Run(testname, func(t *testing.T) {
|
t.Run(testname, func(t *testing.T) {
|
||||||
filter := Filter{
|
filter := Filter{
|
||||||
Parent: &Filter{
|
Parents: []*Filter{
|
||||||
Results: []string{test.Input},
|
{Results: []string{test.Input}},
|
||||||
},
|
},
|
||||||
Var1: test.Query,
|
Var1: test.Query,
|
||||||
}
|
}
|
||||||
|
@ -186,8 +186,8 @@ func TestFilterMatch(t *testing.T) {
|
||||||
testname := fmt.Sprintf("%s", test.Query)
|
testname := fmt.Sprintf("%s", test.Query)
|
||||||
t.Run(testname, func(t *testing.T) {
|
t.Run(testname, func(t *testing.T) {
|
||||||
filter := Filter{
|
filter := Filter{
|
||||||
Parent: &Filter{
|
Parents: []*Filter{
|
||||||
Results: []string{test.Input},
|
{Results: []string{test.Input}},
|
||||||
},
|
},
|
||||||
Var1: test.Query,
|
Var1: test.Query,
|
||||||
}
|
}
|
||||||
|
@ -237,8 +237,8 @@ func TestFilterSubstring(t *testing.T) {
|
||||||
testname := fmt.Sprintf("%s %s", test.Input, test.Query)
|
testname := fmt.Sprintf("%s %s", test.Input, test.Query)
|
||||||
t.Run(testname, func(t *testing.T) {
|
t.Run(testname, func(t *testing.T) {
|
||||||
filter := Filter{
|
filter := Filter{
|
||||||
Parent: &Filter{
|
Parents: []*Filter{
|
||||||
Results: []string{test.Input},
|
{Results: []string{test.Input}},
|
||||||
},
|
},
|
||||||
Var1: test.Query,
|
Var1: test.Query,
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Reference in a new issue