Fixed scraping methods and tests
This commit is contained in:
parent
27695ebdf3
commit
63763582c3
2 changed files with 86 additions and 57 deletions
43
scraping.go
43
scraping.go
|
@ -56,7 +56,7 @@ func getFilterResultXPath(filter *Filter) {
|
||||||
log.Println("Filter", filter.Name, "called without parent for", filter.Type)
|
log.Println("Filter", filter.Name, "called without parent for", filter.Type)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
for _, result := range filter.Parent.results {
|
for _, result := range filter.Parent.Results {
|
||||||
doc, err := htmlquery.Parse(strings.NewReader(result))
|
doc, err := htmlquery.Parse(strings.NewReader(result))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Print(err)
|
log.Print(err)
|
||||||
|
@ -66,7 +66,7 @@ func getFilterResultXPath(filter *Filter) {
|
||||||
for _, node := range nodes {
|
for _, node := range nodes {
|
||||||
var b bytes.Buffer
|
var b bytes.Buffer
|
||||||
html.Render(&b, node)
|
html.Render(&b, node)
|
||||||
filter.results = append(filter.results, html.UnescapeString(b.String()))
|
filter.Results = append(filter.Results, html.UnescapeString(b.String()))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -76,9 +76,9 @@ func getFilterResultJSON(filter *Filter) {
|
||||||
log.Println("Filter", filter.Name, "called without parent for", filter.Type)
|
log.Println("Filter", filter.Name, "called without parent for", filter.Type)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
for _, result := range filter.Parent.results {
|
for _, result := range filter.Parent.Results {
|
||||||
for _, match := range gjson.Get(result, filter.Var1).Array() {
|
for _, match := range gjson.Get(result, filter.Var1).Array() {
|
||||||
filter.results = append(filter.results, match.String())
|
filter.Results = append(filter.Results, match.String())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -88,7 +88,7 @@ func getFilterResultCSS(filter *Filter) {
|
||||||
log.Println("Filter", filter.Name, "called without parent for", filter.Type)
|
log.Println("Filter", filter.Name, "called without parent for", filter.Type)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
for _, result := range filter.results {
|
for _, result := range filter.Parent.Results {
|
||||||
doc, err := html.Parse(strings.NewReader(result))
|
doc, err := html.Parse(strings.NewReader(result))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Print(err)
|
log.Print(err)
|
||||||
|
@ -102,7 +102,8 @@ func getFilterResultCSS(filter *Filter) {
|
||||||
for _, node := range cascadia.QueryAll(doc, sel) {
|
for _, node := range cascadia.QueryAll(doc, sel) {
|
||||||
var b bytes.Buffer
|
var b bytes.Buffer
|
||||||
html.Render(&b, node)
|
html.Render(&b, node)
|
||||||
filter.results = append(filter.results, html.UnescapeString(b.String()))
|
log.Println(b.String())
|
||||||
|
filter.Results = append(filter.Results, html.UnescapeString(b.String()))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -112,16 +113,16 @@ func getFilterResultReplace(filter *Filter) {
|
||||||
log.Println("Filter", filter.Name, "called without parent for", filter.Type)
|
log.Println("Filter", filter.Name, "called without parent for", filter.Type)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
for _, result := range filter.results {
|
for _, result := range filter.Parent.Results {
|
||||||
r, err := regexp.Compile(filter.Var1)
|
r, err := regexp.Compile(filter.Var1)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Print(err)
|
log.Print(err)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if filter.Var2 == nil {
|
if filter.Var2 == nil {
|
||||||
filter.results = append(filter.results, r.ReplaceAllString(result, ""))
|
filter.Results = append(filter.Results, r.ReplaceAllString(result, ""))
|
||||||
} else {
|
} else {
|
||||||
filter.results = append(filter.results, r.ReplaceAllString(result, *filter.Var2))
|
filter.Results = append(filter.Results, r.ReplaceAllString(result, *filter.Var2))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -131,14 +132,16 @@ func getFilterResultMatch(filter *Filter) {
|
||||||
log.Println("Filter", filter.Name, "called without parent for", filter.Type)
|
log.Println("Filter", filter.Name, "called without parent for", filter.Type)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
for _, result := range filter.results {
|
r, err := regexp.Compile(filter.Var1)
|
||||||
r, err := regexp.Compile(filter.Var1)
|
if err != nil {
|
||||||
if err != nil {
|
log.Print(err)
|
||||||
log.Print(err)
|
return
|
||||||
continue
|
}
|
||||||
}
|
for _, result := range filter.Parent.Results {
|
||||||
|
log.Println(">", result)
|
||||||
for _, str := range r.FindAllString(result, -1) {
|
for _, str := range r.FindAllString(result, -1) {
|
||||||
filter.results = append(filter.results, str)
|
log.Println(">>", str)
|
||||||
|
filter.Results = append(filter.Results, str)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -148,7 +151,7 @@ func getFilterResultSubstring(filter *Filter) {
|
||||||
log.Println("Filter", filter.Name, "called without parent for", filter.Type)
|
log.Println("Filter", filter.Name, "called without parent for", filter.Type)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
for _, result := range filter.results {
|
for _, result := range filter.Parent.Results {
|
||||||
substrings := strings.Split(filter.Var1, ",")
|
substrings := strings.Split(filter.Var1, ",")
|
||||||
var sb strings.Builder
|
var sb strings.Builder
|
||||||
asRunes := []rune(result)
|
asRunes := []rune(result)
|
||||||
|
@ -157,6 +160,7 @@ func getFilterResultSubstring(filter *Filter) {
|
||||||
if strings.Contains(substring, ":") {
|
if strings.Contains(substring, ":") {
|
||||||
from_to := strings.Split(substring, ":")
|
from_to := strings.Split(substring, ":")
|
||||||
if len(from_to) != 2 {
|
if len(from_to) != 2 {
|
||||||
|
filter.Results = filter.Parent.Results
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
fromStr := from_to[0]
|
fromStr := from_to[0]
|
||||||
|
@ -167,6 +171,7 @@ func getFilterResultSubstring(filter *Filter) {
|
||||||
from64, err := strconv.ParseInt(fromStr, 10, 32)
|
from64, err := strconv.ParseInt(fromStr, 10, 32)
|
||||||
var from = int(from64)
|
var from = int(from64)
|
||||||
if hasFrom && err != nil {
|
if hasFrom && err != nil {
|
||||||
|
filter.Results = filter.Parent.Results
|
||||||
return
|
return
|
||||||
} else if from < 0 {
|
} else if from < 0 {
|
||||||
from = len(asRunes) + from
|
from = len(asRunes) + from
|
||||||
|
@ -179,6 +184,7 @@ func getFilterResultSubstring(filter *Filter) {
|
||||||
to64, err := strconv.ParseInt(toStr, 10, 32)
|
to64, err := strconv.ParseInt(toStr, 10, 32)
|
||||||
var to = int(to64)
|
var to = int(to64)
|
||||||
if hasTo && err != nil {
|
if hasTo && err != nil {
|
||||||
|
filter.Results = filter.Parent.Results
|
||||||
return
|
return
|
||||||
} else if to < 0 {
|
} else if to < 0 {
|
||||||
to = len(asRunes) + to
|
to = len(asRunes) + to
|
||||||
|
@ -193,11 +199,12 @@ func getFilterResultSubstring(filter *Filter) {
|
||||||
} else {
|
} else {
|
||||||
pos, err := strconv.ParseInt(substring, 10, 32)
|
pos, err := strconv.ParseInt(substring, 10, 32)
|
||||||
if err != nil || pos < 0 {
|
if err != nil || pos < 0 {
|
||||||
|
filter.Results = filter.Parent.Results
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
sb.WriteRune(asRunes[pos])
|
sb.WriteRune(asRunes[pos])
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
filter.results = append(filter.results, sb.String())
|
filter.Results = append(filter.Results, sb.String())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
100
scraping_test.go
100
scraping_test.go
|
@ -55,14 +55,17 @@ func TestFilterXPath(t *testing.T) {
|
||||||
for _, test := range tests {
|
for _, test := range tests {
|
||||||
testname := fmt.Sprintf("%s", test.Query)
|
testname := fmt.Sprintf("%s", test.Query)
|
||||||
t.Run(testname, func(t *testing.T) {
|
t.Run(testname, func(t *testing.T) {
|
||||||
want := []string{}
|
filter := Filter{
|
||||||
getFilterResultXPath(
|
Parent: &Filter{
|
||||||
&Filter{
|
Results: []string{HTML_STRING},
|
||||||
Var1: test.Query,
|
|
||||||
},
|
},
|
||||||
|
Var1: test.Query,
|
||||||
|
}
|
||||||
|
getFilterResultXPath(
|
||||||
|
&filter,
|
||||||
)
|
)
|
||||||
if !reflect.DeepEqual(test.Want, want) {
|
if !reflect.DeepEqual(test.Want, filter.Results) {
|
||||||
t.Errorf("Got %s, want %s", want, test.Want)
|
t.Errorf("Got %s, want %s", filter.Results, test.Want)
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
@ -82,14 +85,17 @@ func TestFilterJSON(t *testing.T) {
|
||||||
for _, test := range tests {
|
for _, test := range tests {
|
||||||
testname := fmt.Sprintf("%s", test.Query)
|
testname := fmt.Sprintf("%s", test.Query)
|
||||||
t.Run(testname, func(t *testing.T) {
|
t.Run(testname, func(t *testing.T) {
|
||||||
want := []string{}
|
filter := Filter{
|
||||||
getFilterResultJSON(
|
Parent: &Filter{
|
||||||
&Filter{
|
Results: []string{JSON_STRING},
|
||||||
Var1: test.Query,
|
|
||||||
},
|
},
|
||||||
|
Var1: test.Query,
|
||||||
|
}
|
||||||
|
getFilterResultJSON(
|
||||||
|
&filter,
|
||||||
)
|
)
|
||||||
if !reflect.DeepEqual(test.Want, want) {
|
if !reflect.DeepEqual(test.Want, filter.Results) {
|
||||||
t.Errorf("Got %s, want %s", want, test.Want)
|
t.Errorf("Got %s, want %s", filter.Results, test.Want)
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
@ -110,14 +116,17 @@ func TestFilterCSS(t *testing.T) {
|
||||||
for _, test := range tests {
|
for _, test := range tests {
|
||||||
testname := fmt.Sprintf("%s", test.Query)
|
testname := fmt.Sprintf("%s", test.Query)
|
||||||
t.Run(testname, func(t *testing.T) {
|
t.Run(testname, func(t *testing.T) {
|
||||||
want := []string{}
|
filter := Filter{
|
||||||
getFilterResultCSS(
|
Parent: &Filter{
|
||||||
&Filter{
|
Results: []string{HTML_STRING},
|
||||||
Var1: test.Query,
|
|
||||||
},
|
},
|
||||||
|
Var1: test.Query,
|
||||||
|
}
|
||||||
|
getFilterResultCSS(
|
||||||
|
&filter,
|
||||||
)
|
)
|
||||||
if !reflect.DeepEqual(test.Want, want) {
|
if !reflect.DeepEqual(test.Want, filter.Results) {
|
||||||
t.Errorf("Got %s, want %s", want, test.Want)
|
t.Errorf("Got %s, want %s", filter.Results, test.Want)
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
@ -131,24 +140,30 @@ func TestFilterReplace(t *testing.T) {
|
||||||
}{
|
}{
|
||||||
{"0123456789", "0", "123456789"},
|
{"0123456789", "0", "123456789"},
|
||||||
{"0123456789", "9", "012345678"},
|
{"0123456789", "9", "012345678"},
|
||||||
{"0123456789", "3456", "01278"},
|
{"0123456789", "3456", "012789"},
|
||||||
{"0123456789_0123456789", "3456", "01278_01278"},
|
{"0123456789_0123456789", "3456", "012789_012789"},
|
||||||
{"世界日本語", "世", "界日本語"},
|
{"世界日本語", "世", "界日本語"},
|
||||||
{"世界日本語", "語", "世界日本語"},
|
{"世界日本語", "語", "世界日本"},
|
||||||
{"世界日_世界日_世界日", "界", "世日_世日_世日"},
|
{"世界日_世界日_世界日", "界", "世日_世日_世日"},
|
||||||
|
// TODO add replace tests
|
||||||
|
// TODO add regex tests
|
||||||
|
// TODO add regex replace tests
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, test := range tests {
|
for _, test := range tests {
|
||||||
testname := fmt.Sprintf("%s %s", test.Input, test.Query)
|
testname := fmt.Sprintf("%s %s", test.Input, test.Query)
|
||||||
t.Run(testname, func(t *testing.T) {
|
t.Run(testname, func(t *testing.T) {
|
||||||
want := []string{test.Want}
|
filter := Filter{
|
||||||
getFilterResultReplace(
|
Parent: &Filter{
|
||||||
&Filter{
|
Results: []string{test.Input},
|
||||||
Var1: test.Query,
|
|
||||||
},
|
},
|
||||||
|
Var1: test.Query,
|
||||||
|
}
|
||||||
|
getFilterResultReplace(
|
||||||
|
&filter,
|
||||||
)
|
)
|
||||||
if want[0] != test.Want {
|
if filter.Results[0] != test.Want {
|
||||||
t.Errorf("Got %s, want %s", want[0], test.Want)
|
t.Errorf("Got %s, want %s", filter.Results, test.Want)
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
@ -170,14 +185,18 @@ func TestFilterMatch(t *testing.T) {
|
||||||
for _, test := range tests {
|
for _, test := range tests {
|
||||||
testname := fmt.Sprintf("%s", test.Query)
|
testname := fmt.Sprintf("%s", test.Query)
|
||||||
t.Run(testname, func(t *testing.T) {
|
t.Run(testname, func(t *testing.T) {
|
||||||
want := []string{}
|
filter := Filter{
|
||||||
getFilterResultMatch(
|
Parent: &Filter{
|
||||||
&Filter{
|
Results: []string{test.Input},
|
||||||
Var1: test.Query,
|
|
||||||
},
|
},
|
||||||
|
Var1: test.Query,
|
||||||
|
}
|
||||||
|
getFilterResultMatch(
|
||||||
|
&filter,
|
||||||
)
|
)
|
||||||
if !reflect.DeepEqual(test.Want, want) {
|
// len() thing cuz filterResults == nil and test.Want == [], same thing but not really...
|
||||||
t.Errorf("Got %s, want %s", want, test.Want)
|
if !(len(filter.Results) == 0 && len(test.Want) == 0) && !reflect.DeepEqual(filter.Results, test.Want) {
|
||||||
|
t.Errorf("Got %s, want %s", filter.Results, test.Want)
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
@ -217,14 +236,17 @@ func TestFilterSubstring(t *testing.T) {
|
||||||
for _, test := range tests {
|
for _, test := range tests {
|
||||||
testname := fmt.Sprintf("%s %s", test.Input, test.Query)
|
testname := fmt.Sprintf("%s %s", test.Input, test.Query)
|
||||||
t.Run(testname, func(t *testing.T) {
|
t.Run(testname, func(t *testing.T) {
|
||||||
want := []string{test.Want}
|
filter := Filter{
|
||||||
getFilterResultSubstring(
|
Parent: &Filter{
|
||||||
&Filter{
|
Results: []string{test.Input},
|
||||||
Var1: test.Query,
|
|
||||||
},
|
},
|
||||||
|
Var1: test.Query,
|
||||||
|
}
|
||||||
|
getFilterResultSubstring(
|
||||||
|
&filter,
|
||||||
)
|
)
|
||||||
if want[0] != test.Want {
|
if filter.Results[0] != test.Want {
|
||||||
t.Errorf("Got %s, want %s", want[0], test.Want)
|
t.Errorf("Got %s, want %s", filter.Results, test.Want)
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Reference in a new issue