From 18ded3534bd90f9cee8d6109f3d5539aa8df6a43 Mon Sep 17 00:00:00 2001 From: BroodjeAap Date: Tue, 2 Aug 2022 19:59:36 +0000 Subject: [PATCH] got all filter types 'working' still bug but jay --- go.mod | 1 + go.sum | 3 +++ scraping.go | 61 +++++++++++++++++++++++++++++++++--------------- scraping_test.go | 8 ++++--- 4 files changed, 51 insertions(+), 22 deletions(-) diff --git a/go.mod b/go.mod index a4c1147..afcd8bb 100644 --- a/go.mod +++ b/go.mod @@ -3,6 +3,7 @@ module broodjeaap.net/go-watch-and-tel go 1.18 require ( + github.com/andybalholm/cascadia v1.3.1 github.com/antchfx/htmlquery v1.2.5 github.com/gin-contrib/multitemplate v0.0.0-20220705015713-e21a0ba39de3 github.com/gin-gonic/gin v1.8.1 diff --git a/go.sum b/go.sum index 0eb07b4..648563c 100644 --- a/go.sum +++ b/go.sum @@ -38,6 +38,8 @@ cloud.google.com/go/storage v1.14.0/go.mod h1:GrKmX003DSIwi9o29oFT7YDnHYwZoctc3f dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= +github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c= +github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA= github.com/antchfx/htmlquery v1.2.5 h1:1lXnx46/1wtv1E/kzmH8vrfMuUKYgkdDBA9pIdMJnk4= github.com/antchfx/htmlquery v1.2.5/go.mod h1:2MCVBzYVafPBmKbrmwB9F5xdd+IEgRY61ci2oOsOQVw= github.com/antchfx/xpath v1.2.1 h1:qhp4EW6aCOVr5XIkT+l6LJ9ck/JsUH/yyauNgTQkBF8= @@ -315,6 +317,7 @@ golang.org/x/net v0.0.0-20201031054903-ff519b6c9102/go.mod h1:sp8m0HH+o8qH0wwXwY golang.org/x/net v0.0.0-20201209123823-ac852fbbde11/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/net v0.0.0-20201224014010-6772e930b67b/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20220722155237-a158d28d115b h1:PxfKdU9lEEDYjdIzOtC4qFWgkU2rGHdKlKowJSMN9h0= golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= diff --git a/scraping.go b/scraping.go index 63b4cd0..8680510 100644 --- a/scraping.go +++ b/scraping.go @@ -9,6 +9,7 @@ import ( "strconv" "strings" + "github.com/andybalholm/cascadia" "github.com/antchfx/htmlquery" "golang.org/x/net/html" ) @@ -31,33 +32,34 @@ func getGroupResult(group *FilterGroup) []string { for _, resultString := range resultStrings { getFilterResult(resultString, &filter, &newStrings) } + log.Println(len(resultStrings), len(newStrings)) resultStrings = newStrings - log.Println(resultStrings) + newStrings = nil } return resultStrings } func getFilterResult(s string, filter *Filter, newStrings *[]string) { switch { - case filter.Type == "css": - { - //getFilterResultReplace(s, filter, newStrings) - } case filter.Type == "xpath": { getFilterResultXPath(s, filter, newStrings) } + case filter.Type == "css": + { + getFilterResultCSS(s, filter, newStrings) + } case filter.Type == "replace": { - //getFilterResultReplace(s, filter, newStrings) + getFilterResultReplace(s, filter, newStrings) } case filter.Type == "regex": { - //getFilterResultRegex(s, filter, newStrings) + getFilterResultRegex(s, filter, newStrings) } case filter.Type == "substring": { - //getFilterResultSubstring(s, filter, newStrings) + getFilterResultSubstring(s, filter, newStrings) } default: @@ -78,19 +80,40 @@ func getFilterResultXPath(s string, filter *Filter, newStrings *[]string) { } } -func getFilterResultReplace(s string, filter *Filter) string { - return strings.ReplaceAll(s, filter.From, filter.To) +func getFilterResultCSS(s string, filter *Filter, newStrings *[]string) { + doc, err := html.Parse(strings.NewReader(s)) + if err != nil { + log.Print(err) + return + } + sel, err := cascadia.Parse(filter.From) + if err != nil { + log.Print(err) + return + } + for _, node := range cascadia.QueryAll(doc, sel) { + var b bytes.Buffer + html.Render(&b, node) + log.Println("test") + log.Println(html.UnescapeString(b.String())) + *newStrings = append(*newStrings, html.UnescapeString(b.String())) + } } -func getFilterResultRegex(s string, filter *Filter) string { +func getFilterResultReplace(s string, filter *Filter, newStrings *[]string) { + *newStrings = append(*newStrings, strings.ReplaceAll(s, filter.From, filter.To)) +} + +func getFilterResultRegex(s string, filter *Filter, newStrings *[]string) { regex, err := regexp.Compile(filter.From) if err != nil { - return s + log.Print(err) + return } - return regex.ReplaceAllString(s, filter.To) + *newStrings = append(*newStrings, regex.ReplaceAllString(s, filter.To)) } -func getFilterResultSubstring(s string, filter *Filter) string { +func getFilterResultSubstring(s string, filter *Filter, newStrings *[]string) { substrings := strings.Split(filter.From, ",") var sb strings.Builder asRunes := []rune(s) @@ -99,7 +122,7 @@ func getFilterResultSubstring(s string, filter *Filter) string { if strings.Contains(substring, ":") { from_to := strings.Split(substring, ":") if len(from_to) != 2 { - return s + return } fromStr := from_to[0] var hasFrom bool = true @@ -109,7 +132,7 @@ func getFilterResultSubstring(s string, filter *Filter) string { from64, err := strconv.ParseInt(fromStr, 10, 32) var from = int(from64) if hasFrom && err != nil { - return s + return } else if from < 0 { from = len(asRunes) + from } @@ -121,7 +144,7 @@ func getFilterResultSubstring(s string, filter *Filter) string { to64, err := strconv.ParseInt(toStr, 10, 32) var to = int(to64) if hasTo && err != nil { - return s + return } else if to < 0 { to = len(asRunes) + to } @@ -135,10 +158,10 @@ func getFilterResultSubstring(s string, filter *Filter) string { } else { pos, err := strconv.ParseInt(substring, 10, 32) if err != nil || pos < 0 { - return s + return } sb.WriteRune(asRunes[pos]) } } - return sb.String() + *newStrings = append(*newStrings, sb.String()) } diff --git a/scraping_test.go b/scraping_test.go index e90486c..7ac0cfc 100644 --- a/scraping_test.go +++ b/scraping_test.go @@ -39,14 +39,16 @@ func TestFilterSubstring(t *testing.T) { for _, test := range tests { testname := fmt.Sprintf("%s %s", test.Input, test.Query) t.Run(testname, func(t *testing.T) { - result := getFilterResultSubstring( + want := []string{test.Want} + getFilterResultSubstring( test.Input, &Filter{ From: test.Query, }, + &want, ) - if result != test.Want { - t.Errorf("Got %s, want %s", result, test.Want) + if want[0] != test.Want { + t.Errorf("Got %s, want %s", want[0], test.Want) } }) }