got all filter types 'working', still buggy but yay
This commit is contained in:
parent
f9eaf80a01
commit
18ded3534b
4 changed files with 51 additions and 22 deletions
1
go.mod
1
go.mod
|
@ -3,6 +3,7 @@ module broodjeaap.net/go-watch-and-tel
|
|||
go 1.18
|
||||
|
||||
require (
|
||||
github.com/andybalholm/cascadia v1.3.1
|
||||
github.com/antchfx/htmlquery v1.2.5
|
||||
github.com/gin-contrib/multitemplate v0.0.0-20220705015713-e21a0ba39de3
|
||||
github.com/gin-gonic/gin v1.8.1
|
||||
|
|
3
go.sum
3
go.sum
|
@ -38,6 +38,8 @@ cloud.google.com/go/storage v1.14.0/go.mod h1:GrKmX003DSIwi9o29oFT7YDnHYwZoctc3f
|
|||
dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU=
|
||||
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
|
||||
github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo=
|
||||
github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c=
|
||||
github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA=
|
||||
github.com/antchfx/htmlquery v1.2.5 h1:1lXnx46/1wtv1E/kzmH8vrfMuUKYgkdDBA9pIdMJnk4=
|
||||
github.com/antchfx/htmlquery v1.2.5/go.mod h1:2MCVBzYVafPBmKbrmwB9F5xdd+IEgRY61ci2oOsOQVw=
|
||||
github.com/antchfx/xpath v1.2.1 h1:qhp4EW6aCOVr5XIkT+l6LJ9ck/JsUH/yyauNgTQkBF8=
|
||||
|
@ -315,6 +317,7 @@ golang.org/x/net v0.0.0-20201031054903-ff519b6c9102/go.mod h1:sp8m0HH+o8qH0wwXwY
|
|||
golang.org/x/net v0.0.0-20201209123823-ac852fbbde11/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
|
||||
golang.org/x/net v0.0.0-20201224014010-6772e930b67b/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
|
||||
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
|
||||
golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
|
||||
golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
|
||||
golang.org/x/net v0.0.0-20220722155237-a158d28d115b h1:PxfKdU9lEEDYjdIzOtC4qFWgkU2rGHdKlKowJSMN9h0=
|
||||
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
|
||||
|
|
61
scraping.go
61
scraping.go
|
@ -9,6 +9,7 @@ import (
|
|||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/andybalholm/cascadia"
|
||||
"github.com/antchfx/htmlquery"
|
||||
"golang.org/x/net/html"
|
||||
)
|
||||
|
@ -31,33 +32,34 @@ func getGroupResult(group *FilterGroup) []string {
|
|||
for _, resultString := range resultStrings {
|
||||
getFilterResult(resultString, &filter, &newStrings)
|
||||
}
|
||||
log.Println(len(resultStrings), len(newStrings))
|
||||
resultStrings = newStrings
|
||||
log.Println(resultStrings)
|
||||
newStrings = nil
|
||||
}
|
||||
return resultStrings
|
||||
}
|
||||
|
||||
func getFilterResult(s string, filter *Filter, newStrings *[]string) {
|
||||
switch {
|
||||
case filter.Type == "css":
|
||||
{
|
||||
//getFilterResultReplace(s, filter, newStrings)
|
||||
}
|
||||
case filter.Type == "xpath":
|
||||
{
|
||||
getFilterResultXPath(s, filter, newStrings)
|
||||
}
|
||||
case filter.Type == "css":
|
||||
{
|
||||
getFilterResultCSS(s, filter, newStrings)
|
||||
}
|
||||
case filter.Type == "replace":
|
||||
{
|
||||
//getFilterResultReplace(s, filter, newStrings)
|
||||
getFilterResultReplace(s, filter, newStrings)
|
||||
}
|
||||
case filter.Type == "regex":
|
||||
{
|
||||
//getFilterResultRegex(s, filter, newStrings)
|
||||
getFilterResultRegex(s, filter, newStrings)
|
||||
}
|
||||
case filter.Type == "substring":
|
||||
{
|
||||
//getFilterResultSubstring(s, filter, newStrings)
|
||||
getFilterResultSubstring(s, filter, newStrings)
|
||||
}
|
||||
default:
|
||||
|
||||
|
@ -78,19 +80,40 @@ func getFilterResultXPath(s string, filter *Filter, newStrings *[]string) {
|
|||
}
|
||||
}
|
||||
|
||||
func getFilterResultReplace(s string, filter *Filter) string {
|
||||
return strings.ReplaceAll(s, filter.From, filter.To)
|
||||
func getFilterResultCSS(s string, filter *Filter, newStrings *[]string) {
|
||||
doc, err := html.Parse(strings.NewReader(s))
|
||||
if err != nil {
|
||||
log.Print(err)
|
||||
return
|
||||
}
|
||||
sel, err := cascadia.Parse(filter.From)
|
||||
if err != nil {
|
||||
log.Print(err)
|
||||
return
|
||||
}
|
||||
for _, node := range cascadia.QueryAll(doc, sel) {
|
||||
var b bytes.Buffer
|
||||
html.Render(&b, node)
|
||||
log.Println("test")
|
||||
log.Println(html.UnescapeString(b.String()))
|
||||
*newStrings = append(*newStrings, html.UnescapeString(b.String()))
|
||||
}
|
||||
}
|
||||
|
||||
func getFilterResultRegex(s string, filter *Filter) string {
|
||||
func getFilterResultReplace(s string, filter *Filter, newStrings *[]string) {
|
||||
*newStrings = append(*newStrings, strings.ReplaceAll(s, filter.From, filter.To))
|
||||
}
|
||||
|
||||
func getFilterResultRegex(s string, filter *Filter, newStrings *[]string) {
|
||||
regex, err := regexp.Compile(filter.From)
|
||||
if err != nil {
|
||||
return s
|
||||
log.Print(err)
|
||||
return
|
||||
}
|
||||
return regex.ReplaceAllString(s, filter.To)
|
||||
*newStrings = append(*newStrings, regex.ReplaceAllString(s, filter.To))
|
||||
}
|
||||
|
||||
func getFilterResultSubstring(s string, filter *Filter) string {
|
||||
func getFilterResultSubstring(s string, filter *Filter, newStrings *[]string) {
|
||||
substrings := strings.Split(filter.From, ",")
|
||||
var sb strings.Builder
|
||||
asRunes := []rune(s)
|
||||
|
@ -99,7 +122,7 @@ func getFilterResultSubstring(s string, filter *Filter) string {
|
|||
if strings.Contains(substring, ":") {
|
||||
from_to := strings.Split(substring, ":")
|
||||
if len(from_to) != 2 {
|
||||
return s
|
||||
return
|
||||
}
|
||||
fromStr := from_to[0]
|
||||
var hasFrom bool = true
|
||||
|
@ -109,7 +132,7 @@ func getFilterResultSubstring(s string, filter *Filter) string {
|
|||
from64, err := strconv.ParseInt(fromStr, 10, 32)
|
||||
var from = int(from64)
|
||||
if hasFrom && err != nil {
|
||||
return s
|
||||
return
|
||||
} else if from < 0 {
|
||||
from = len(asRunes) + from
|
||||
}
|
||||
|
@ -121,7 +144,7 @@ func getFilterResultSubstring(s string, filter *Filter) string {
|
|||
to64, err := strconv.ParseInt(toStr, 10, 32)
|
||||
var to = int(to64)
|
||||
if hasTo && err != nil {
|
||||
return s
|
||||
return
|
||||
} else if to < 0 {
|
||||
to = len(asRunes) + to
|
||||
}
|
||||
|
@ -135,10 +158,10 @@ func getFilterResultSubstring(s string, filter *Filter) string {
|
|||
} else {
|
||||
pos, err := strconv.ParseInt(substring, 10, 32)
|
||||
if err != nil || pos < 0 {
|
||||
return s
|
||||
return
|
||||
}
|
||||
sb.WriteRune(asRunes[pos])
|
||||
}
|
||||
}
|
||||
return sb.String()
|
||||
*newStrings = append(*newStrings, sb.String())
|
||||
}
|
||||
|
|
|
@ -39,14 +39,16 @@ func TestFilterSubstring(t *testing.T) {
|
|||
for _, test := range tests {
|
||||
testname := fmt.Sprintf("%s %s", test.Input, test.Query)
|
||||
t.Run(testname, func(t *testing.T) {
|
||||
result := getFilterResultSubstring(
|
||||
want := []string{test.Want}
|
||||
getFilterResultSubstring(
|
||||
test.Input,
|
||||
&Filter{
|
||||
From: test.Query,
|
||||
},
|
||||
&want,
|
||||
)
|
||||
if result != test.Want {
|
||||
t.Errorf("Got %s, want %s", result, test.Want)
|
||||
if want[0] != test.Want {
|
||||
t.Errorf("Got %s, want %s", want[0], test.Want)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue