From 63763582c304766e7a1074c265d9c1c0670319b0 Mon Sep 17 00:00:00 2001 From: BroodjeAap Date: Sat, 3 Sep 2022 14:52:56 +0000 Subject: [PATCH] fixed scraping methods with and tests --- scraping.go | 43 +++++++++++--------- scraping_test.go | 100 +++++++++++++++++++++++++++++------------------ 2 files changed, 86 insertions(+), 57 deletions(-) diff --git a/scraping.go b/scraping.go index 3339250..0401a2d 100644 --- a/scraping.go +++ b/scraping.go @@ -56,7 +56,7 @@ func getFilterResultXPath(filter *Filter) { log.Println("Filter", filter.Name, "called without parent for", filter.Type) return } - for _, result := range filter.Parent.results { + for _, result := range filter.Parent.Results { doc, err := htmlquery.Parse(strings.NewReader(result)) if err != nil { log.Print(err) @@ -66,7 +66,7 @@ func getFilterResultXPath(filter *Filter) { for _, node := range nodes { var b bytes.Buffer html.Render(&b, node) - filter.results = append(filter.results, html.UnescapeString(b.String())) + filter.Results = append(filter.Results, html.UnescapeString(b.String())) } } } @@ -76,9 +76,9 @@ func getFilterResultJSON(filter *Filter) { log.Println("Filter", filter.Name, "called without parent for", filter.Type) return } - for _, result := range filter.Parent.results { + for _, result := range filter.Parent.Results { for _, match := range gjson.Get(result, filter.Var1).Array() { - filter.results = append(filter.results, match.String()) + filter.Results = append(filter.Results, match.String()) } } } @@ -88,7 +88,7 @@ func getFilterResultCSS(filter *Filter) { log.Println("Filter", filter.Name, "called without parent for", filter.Type) return } - for _, result := range filter.results { + for _, result := range filter.Parent.Results { doc, err := html.Parse(strings.NewReader(result)) if err != nil { log.Print(err) @@ -102,7 +102,8 @@ func getFilterResultCSS(filter *Filter) { for _, node := range cascadia.QueryAll(doc, sel) { var b bytes.Buffer html.Render(&b, node) - filter.results = append(filter.results, html.UnescapeString(b.String())) + log.Println(b.String()) + filter.Results = append(filter.Results, html.UnescapeString(b.String())) } } } @@ -112,16 +113,16 @@ func getFilterResultReplace(filter *Filter) { log.Println("Filter", filter.Name, "called without parent for", filter.Type) return } - for _, result := range filter.results { + for _, result := range filter.Parent.Results { r, err := regexp.Compile(filter.Var1) if err != nil { log.Print(err) continue } if filter.Var2 == nil { - filter.results = append(filter.results, r.ReplaceAllString(result, "")) + filter.Results = append(filter.Results, r.ReplaceAllString(result, "")) } else { - filter.results = append(filter.results, r.ReplaceAllString(result, *filter.Var2)) + filter.Results = append(filter.Results, r.ReplaceAllString(result, *filter.Var2)) } } } @@ -131,14 +132,16 @@ func getFilterResultMatch(filter *Filter) { log.Println("Filter", filter.Name, "called without parent for", filter.Type) return } - for _, result := range filter.results { - r, err := regexp.Compile(filter.Var1) - if err != nil { - log.Print(err) - continue - } + r, err := regexp.Compile(filter.Var1) + if err != nil { + log.Print(err) + return + } + for _, result := range filter.Parent.Results { + log.Println(">", result) for _, str := range r.FindAllString(result, -1) { - filter.results = append(filter.results, str) + log.Println(">>", str) + filter.Results = append(filter.Results, str) } } } @@ -148,7 +151,7 @@ func getFilterResultSubstring(filter *Filter) { log.Println("Filter", filter.Name, "called without parent for", filter.Type) return } - for _, result := range filter.results { + for _, result := range filter.Parent.Results { substrings := strings.Split(filter.Var1, ",") var sb strings.Builder asRunes := []rune(result) @@ -157,6 +160,7 @@ func getFilterResultSubstring(filter *Filter) { if strings.Contains(substring, ":") { from_to := strings.Split(substring, ":") if len(from_to) != 2 { + filter.Results = filter.Parent.Results return } fromStr := from_to[0] @@ -167,6 +171,7 @@ func getFilterResultSubstring(filter *Filter) { from64, err := strconv.ParseInt(fromStr, 10, 32) var from = int(from64) if hasFrom && err != nil { + filter.Results = filter.Parent.Results return } else if from < 0 { from = len(asRunes) + from @@ -179,6 +184,7 @@ func getFilterResultSubstring(filter *Filter) { to64, err := strconv.ParseInt(toStr, 10, 32) var to = int(to64) if hasTo && err != nil { + filter.Results = filter.Parent.Results return } else if to < 0 { to = len(asRunes) + to @@ -193,11 +199,12 @@ func getFilterResultSubstring(filter *Filter) { } else { pos, err := strconv.ParseInt(substring, 10, 32) if err != nil || pos < 0 { + filter.Results = filter.Parent.Results return } sb.WriteRune(asRunes[pos]) } } - filter.results = append(filter.results, sb.String()) + filter.Results = append(filter.Results, sb.String()) } } diff --git a/scraping_test.go b/scraping_test.go index 6e4e92b..dbb8096 100644 --- a/scraping_test.go +++ b/scraping_test.go @@ -55,14 +55,17 @@ func TestFilterXPath(t *testing.T) { for _, test := range tests { testname := fmt.Sprintf("%s", test.Query) t.Run(testname, func(t *testing.T) { - want := []string{} - getFilterResultXPath( - &Filter{ - Var1: test.Query, + filter := Filter{ + Parent: &Filter{ + Results: []string{HTML_STRING}, }, + Var1: test.Query, + } + getFilterResultXPath( + &filter, ) - if !reflect.DeepEqual(test.Want, want) { - t.Errorf("Got %s, want %s", want, test.Want) + if !reflect.DeepEqual(test.Want, filter.Results) { + t.Errorf("Got %s, want %s", filter.Results, test.Want) } }) } @@ -82,14 +85,17 @@ func TestFilterJSON(t *testing.T) { for _, test := range tests { testname := fmt.Sprintf("%s", test.Query) t.Run(testname, func(t *testing.T) { - want := []string{} - getFilterResultJSON( - &Filter{ - Var1: test.Query, + filter := Filter{ + Parent: &Filter{ + Results: []string{JSON_STRING}, }, + Var1: test.Query, + } + getFilterResultJSON( + &filter, ) - if !reflect.DeepEqual(test.Want, want) { - t.Errorf("Got %s, want %s", want, test.Want) + if !reflect.DeepEqual(test.Want, filter.Results) { + t.Errorf("Got %s, want %s", filter.Results, test.Want) } }) } @@ -110,14 +116,17 @@ func TestFilterCSS(t *testing.T) { for _, test := range tests { testname := fmt.Sprintf("%s", test.Query) t.Run(testname, func(t *testing.T) { - want := []string{} - getFilterResultCSS( - &Filter{ - Var1: test.Query, + filter := Filter{ + Parent: &Filter{ + Results: []string{HTML_STRING}, }, + Var1: test.Query, + } + getFilterResultCSS( + &filter, ) - if !reflect.DeepEqual(test.Want, want) { - t.Errorf("Got %s, want %s", want, test.Want) + if !reflect.DeepEqual(test.Want, filter.Results) { + t.Errorf("Got %s, want %s", filter.Results, test.Want) } }) } @@ -131,24 +140,30 @@ func TestFilterReplace(t *testing.T) { }{ {"0123456789", "0", "123456789"}, {"0123456789", "9", "012345678"}, - {"0123456789", "3456", "01278"}, - {"0123456789_0123456789", "3456", "01278_01278"}, + {"0123456789", "3456", "012789"}, + {"0123456789_0123456789", "3456", "012789_012789"}, {"世界日本語", "世", "界日本語"}, - {"世界日本語", "語", "世界日本語"}, + {"世界日本語", "語", "世界日本"}, {"世界日_世界日_世界日", "界", "世日_世日_世日"}, + // TODO add replace tests + // TODO add regex tests + // TODO add regex replace tests } for _, test := range tests { testname := fmt.Sprintf("%s %s", test.Input, test.Query) t.Run(testname, func(t *testing.T) { - want := []string{test.Want} - getFilterResultReplace( - &Filter{ - Var1: test.Query, + filter := Filter{ + Parent: &Filter{ + Results: []string{test.Input}, }, + Var1: test.Query, + } + getFilterResultReplace( + &filter, ) - if want[0] != test.Want { - t.Errorf("Got %s, want %s", want[0], test.Want) + if filter.Results[0] != test.Want { + t.Errorf("Got %s, want %s", filter.Results, test.Want) } }) } @@ -170,14 +185,18 @@ func TestFilterMatch(t *testing.T) { for _, test := range tests { testname := fmt.Sprintf("%s", test.Query) t.Run(testname, func(t *testing.T) { - want := []string{} - getFilterResultMatch( - &Filter{ - Var1: test.Query, + filter := Filter{ + Parent: &Filter{ + Results: []string{test.Input}, }, + Var1: test.Query, + } + getFilterResultMatch( + &filter, ) - if !reflect.DeepEqual(test.Want, want) { - t.Errorf("Got %s, want %s", want, test.Want) + // len() thing cuz filterResults == nil and test.Want == [], same thing but not really... + if !(len(filter.Results) == 0 && len(test.Want) == 0) && !reflect.DeepEqual(filter.Results, test.Want) { + t.Errorf("Got %s, want %s", filter.Results, test.Want) } }) } @@ -217,14 +236,17 @@ func TestFilterSubstring(t *testing.T) { for _, test := range tests { testname := fmt.Sprintf("%s %s", test.Input, test.Query) t.Run(testname, func(t *testing.T) { - want := []string{test.Want} - getFilterResultSubstring( - &Filter{ - Var1: test.Query, + filter := Filter{ + Parent: &Filter{ + Results: []string{test.Input}, }, + Var1: test.Query, + } + getFilterResultSubstring( + &filter, ) - if want[0] != test.Want { - t.Errorf("Got %s, want %s", want[0], test.Want) + if filter.Results[0] != test.Want { + t.Errorf("Got %s, want %s", filter.Results, test.Want) } }) }