Got all filter types 'working'; still buggy, but yay
This commit is contained in:
parent
f9eaf80a01
commit
18ded3534b
4 changed files with 51 additions and 22 deletions
1
go.mod
1
go.mod
|
@ -3,6 +3,7 @@ module broodjeaap.net/go-watch-and-tel
|
||||||
go 1.18
|
go 1.18
|
||||||
|
|
||||||
require (
|
require (
|
||||||
|
github.com/andybalholm/cascadia v1.3.1
|
||||||
github.com/antchfx/htmlquery v1.2.5
|
github.com/antchfx/htmlquery v1.2.5
|
||||||
github.com/gin-contrib/multitemplate v0.0.0-20220705015713-e21a0ba39de3
|
github.com/gin-contrib/multitemplate v0.0.0-20220705015713-e21a0ba39de3
|
||||||
github.com/gin-gonic/gin v1.8.1
|
github.com/gin-gonic/gin v1.8.1
|
||||||
|
|
3
go.sum
3
go.sum
|
@ -38,6 +38,8 @@ cloud.google.com/go/storage v1.14.0/go.mod h1:GrKmX003DSIwi9o29oFT7YDnHYwZoctc3f
|
||||||
dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU=
|
dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU=
|
||||||
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
|
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
|
||||||
github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo=
|
github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo=
|
||||||
|
github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c=
|
||||||
|
github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA=
|
||||||
github.com/antchfx/htmlquery v1.2.5 h1:1lXnx46/1wtv1E/kzmH8vrfMuUKYgkdDBA9pIdMJnk4=
|
github.com/antchfx/htmlquery v1.2.5 h1:1lXnx46/1wtv1E/kzmH8vrfMuUKYgkdDBA9pIdMJnk4=
|
||||||
github.com/antchfx/htmlquery v1.2.5/go.mod h1:2MCVBzYVafPBmKbrmwB9F5xdd+IEgRY61ci2oOsOQVw=
|
github.com/antchfx/htmlquery v1.2.5/go.mod h1:2MCVBzYVafPBmKbrmwB9F5xdd+IEgRY61ci2oOsOQVw=
|
||||||
github.com/antchfx/xpath v1.2.1 h1:qhp4EW6aCOVr5XIkT+l6LJ9ck/JsUH/yyauNgTQkBF8=
|
github.com/antchfx/xpath v1.2.1 h1:qhp4EW6aCOVr5XIkT+l6LJ9ck/JsUH/yyauNgTQkBF8=
|
||||||
|
@ -315,6 +317,7 @@ golang.org/x/net v0.0.0-20201031054903-ff519b6c9102/go.mod h1:sp8m0HH+o8qH0wwXwY
|
||||||
golang.org/x/net v0.0.0-20201209123823-ac852fbbde11/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
|
golang.org/x/net v0.0.0-20201209123823-ac852fbbde11/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
|
||||||
golang.org/x/net v0.0.0-20201224014010-6772e930b67b/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
|
golang.org/x/net v0.0.0-20201224014010-6772e930b67b/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
|
||||||
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
|
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
|
||||||
|
golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
|
||||||
golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
|
golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
|
||||||
golang.org/x/net v0.0.0-20220722155237-a158d28d115b h1:PxfKdU9lEEDYjdIzOtC4qFWgkU2rGHdKlKowJSMN9h0=
|
golang.org/x/net v0.0.0-20220722155237-a158d28d115b h1:PxfKdU9lEEDYjdIzOtC4qFWgkU2rGHdKlKowJSMN9h0=
|
||||||
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
|
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
|
||||||
|
|
61
scraping.go
61
scraping.go
|
@ -9,6 +9,7 @@ import (
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
|
"github.com/andybalholm/cascadia"
|
||||||
"github.com/antchfx/htmlquery"
|
"github.com/antchfx/htmlquery"
|
||||||
"golang.org/x/net/html"
|
"golang.org/x/net/html"
|
||||||
)
|
)
|
||||||
|
@ -31,33 +32,34 @@ func getGroupResult(group *FilterGroup) []string {
|
||||||
for _, resultString := range resultStrings {
|
for _, resultString := range resultStrings {
|
||||||
getFilterResult(resultString, &filter, &newStrings)
|
getFilterResult(resultString, &filter, &newStrings)
|
||||||
}
|
}
|
||||||
|
log.Println(len(resultStrings), len(newStrings))
|
||||||
resultStrings = newStrings
|
resultStrings = newStrings
|
||||||
log.Println(resultStrings)
|
newStrings = nil
|
||||||
}
|
}
|
||||||
return resultStrings
|
return resultStrings
|
||||||
}
|
}
|
||||||
|
|
||||||
func getFilterResult(s string, filter *Filter, newStrings *[]string) {
|
func getFilterResult(s string, filter *Filter, newStrings *[]string) {
|
||||||
switch {
|
switch {
|
||||||
case filter.Type == "css":
|
|
||||||
{
|
|
||||||
//getFilterResultReplace(s, filter, newStrings)
|
|
||||||
}
|
|
||||||
case filter.Type == "xpath":
|
case filter.Type == "xpath":
|
||||||
{
|
{
|
||||||
getFilterResultXPath(s, filter, newStrings)
|
getFilterResultXPath(s, filter, newStrings)
|
||||||
}
|
}
|
||||||
|
case filter.Type == "css":
|
||||||
|
{
|
||||||
|
getFilterResultCSS(s, filter, newStrings)
|
||||||
|
}
|
||||||
case filter.Type == "replace":
|
case filter.Type == "replace":
|
||||||
{
|
{
|
||||||
//getFilterResultReplace(s, filter, newStrings)
|
getFilterResultReplace(s, filter, newStrings)
|
||||||
}
|
}
|
||||||
case filter.Type == "regex":
|
case filter.Type == "regex":
|
||||||
{
|
{
|
||||||
//getFilterResultRegex(s, filter, newStrings)
|
getFilterResultRegex(s, filter, newStrings)
|
||||||
}
|
}
|
||||||
case filter.Type == "substring":
|
case filter.Type == "substring":
|
||||||
{
|
{
|
||||||
//getFilterResultSubstring(s, filter, newStrings)
|
getFilterResultSubstring(s, filter, newStrings)
|
||||||
}
|
}
|
||||||
default:
|
default:
|
||||||
|
|
||||||
|
@ -78,19 +80,40 @@ func getFilterResultXPath(s string, filter *Filter, newStrings *[]string) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func getFilterResultReplace(s string, filter *Filter) string {
|
func getFilterResultCSS(s string, filter *Filter, newStrings *[]string) {
|
||||||
return strings.ReplaceAll(s, filter.From, filter.To)
|
doc, err := html.Parse(strings.NewReader(s))
|
||||||
|
if err != nil {
|
||||||
|
log.Print(err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
sel, err := cascadia.Parse(filter.From)
|
||||||
|
if err != nil {
|
||||||
|
log.Print(err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for _, node := range cascadia.QueryAll(doc, sel) {
|
||||||
|
var b bytes.Buffer
|
||||||
|
html.Render(&b, node)
|
||||||
|
log.Println("test")
|
||||||
|
log.Println(html.UnescapeString(b.String()))
|
||||||
|
*newStrings = append(*newStrings, html.UnescapeString(b.String()))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func getFilterResultRegex(s string, filter *Filter) string {
|
func getFilterResultReplace(s string, filter *Filter, newStrings *[]string) {
|
||||||
|
*newStrings = append(*newStrings, strings.ReplaceAll(s, filter.From, filter.To))
|
||||||
|
}
|
||||||
|
|
||||||
|
func getFilterResultRegex(s string, filter *Filter, newStrings *[]string) {
|
||||||
regex, err := regexp.Compile(filter.From)
|
regex, err := regexp.Compile(filter.From)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return s
|
log.Print(err)
|
||||||
|
return
|
||||||
}
|
}
|
||||||
return regex.ReplaceAllString(s, filter.To)
|
*newStrings = append(*newStrings, regex.ReplaceAllString(s, filter.To))
|
||||||
}
|
}
|
||||||
|
|
||||||
func getFilterResultSubstring(s string, filter *Filter) string {
|
func getFilterResultSubstring(s string, filter *Filter, newStrings *[]string) {
|
||||||
substrings := strings.Split(filter.From, ",")
|
substrings := strings.Split(filter.From, ",")
|
||||||
var sb strings.Builder
|
var sb strings.Builder
|
||||||
asRunes := []rune(s)
|
asRunes := []rune(s)
|
||||||
|
@ -99,7 +122,7 @@ func getFilterResultSubstring(s string, filter *Filter) string {
|
||||||
if strings.Contains(substring, ":") {
|
if strings.Contains(substring, ":") {
|
||||||
from_to := strings.Split(substring, ":")
|
from_to := strings.Split(substring, ":")
|
||||||
if len(from_to) != 2 {
|
if len(from_to) != 2 {
|
||||||
return s
|
return
|
||||||
}
|
}
|
||||||
fromStr := from_to[0]
|
fromStr := from_to[0]
|
||||||
var hasFrom bool = true
|
var hasFrom bool = true
|
||||||
|
@ -109,7 +132,7 @@ func getFilterResultSubstring(s string, filter *Filter) string {
|
||||||
from64, err := strconv.ParseInt(fromStr, 10, 32)
|
from64, err := strconv.ParseInt(fromStr, 10, 32)
|
||||||
var from = int(from64)
|
var from = int(from64)
|
||||||
if hasFrom && err != nil {
|
if hasFrom && err != nil {
|
||||||
return s
|
return
|
||||||
} else if from < 0 {
|
} else if from < 0 {
|
||||||
from = len(asRunes) + from
|
from = len(asRunes) + from
|
||||||
}
|
}
|
||||||
|
@ -121,7 +144,7 @@ func getFilterResultSubstring(s string, filter *Filter) string {
|
||||||
to64, err := strconv.ParseInt(toStr, 10, 32)
|
to64, err := strconv.ParseInt(toStr, 10, 32)
|
||||||
var to = int(to64)
|
var to = int(to64)
|
||||||
if hasTo && err != nil {
|
if hasTo && err != nil {
|
||||||
return s
|
return
|
||||||
} else if to < 0 {
|
} else if to < 0 {
|
||||||
to = len(asRunes) + to
|
to = len(asRunes) + to
|
||||||
}
|
}
|
||||||
|
@ -135,10 +158,10 @@ func getFilterResultSubstring(s string, filter *Filter) string {
|
||||||
} else {
|
} else {
|
||||||
pos, err := strconv.ParseInt(substring, 10, 32)
|
pos, err := strconv.ParseInt(substring, 10, 32)
|
||||||
if err != nil || pos < 0 {
|
if err != nil || pos < 0 {
|
||||||
return s
|
return
|
||||||
}
|
}
|
||||||
sb.WriteRune(asRunes[pos])
|
sb.WriteRune(asRunes[pos])
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return sb.String()
|
*newStrings = append(*newStrings, sb.String())
|
||||||
}
|
}
|
||||||
|
|
|
@ -39,14 +39,16 @@ func TestFilterSubstring(t *testing.T) {
|
||||||
for _, test := range tests {
|
for _, test := range tests {
|
||||||
testname := fmt.Sprintf("%s %s", test.Input, test.Query)
|
testname := fmt.Sprintf("%s %s", test.Input, test.Query)
|
||||||
t.Run(testname, func(t *testing.T) {
|
t.Run(testname, func(t *testing.T) {
|
||||||
result := getFilterResultSubstring(
|
want := []string{test.Want}
|
||||||
|
getFilterResultSubstring(
|
||||||
test.Input,
|
test.Input,
|
||||||
&Filter{
|
&Filter{
|
||||||
From: test.Query,
|
From: test.Query,
|
||||||
},
|
},
|
||||||
|
&want,
|
||||||
)
|
)
|
||||||
if result != test.Want {
|
if want[0] != test.Want {
|
||||||
t.Errorf("Got %s, want %s", result, test.Want)
|
t.Errorf("Got %s, want %s", want[0], test.Want)
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Reference in a new issue