go-watch/scraping.go

184 lines
4.1 KiB
Go

package main
import (
"bytes"
"io/ioutil"
"log"
"net/http"
"regexp"
"strconv"
"strings"
"github.com/andybalholm/cascadia"
"github.com/antchfx/htmlquery"
"github.com/tidwall/gjson"
"golang.org/x/net/html"
)
// getGroupResult downloads group.URL.URL and runs the response body through
// every filter in group.Filters, in order. Each filter is applied to every
// string produced by the previous stage, and the strings it emits become the
// input of the next stage.
//
// It returns the strings produced by the last filter (the raw body as a single
// element when there are no filters). If the request or reading the body
// fails, the failure is logged and an empty slice is returned. The result is
// nil when a filter stage emits nothing.
//
// NOTE(review): http.Get uses http.DefaultClient, which has no timeout, and
// the HTTP status code is not inspected, so error pages are filtered like any
// other body.
func getGroupResult(group *FilterGroup) []string {
	resp, err := http.Get(group.URL.URL)
	if err != nil {
		// log.Print inserts no space between string operands, so the
		// separators are spelled out explicitly.
		log.Print("Something went wrong loading ", group.URL.URL, ": ", err)
		return []string{}
	}
	defer resp.Body.Close()

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		log.Print("Something went wrong loading ", group.URL.URL, ": ", err)
		return []string{}
	}

	resultStrings := []string{string(body)}
	for _, filter := range group.Filters {
		// Every stage starts from an empty output slice that the filter
		// functions append to.
		var stageOutput []string
		for _, resultString := range resultStrings {
			getFilterResult(resultString, &filter, &stageOutput)
		}
		resultStrings = stageOutput
	}
	return resultStrings
}
// getFilterResult applies a single filter to s, dispatching on filter.Type to
// the matching getFilterResult* implementation. The selected implementation
// appends its output to *newStrings. A filter whose type is not one of
// "xpath", "json", "css", "replace", "match" or "substring" is ignored and
// nothing is appended.
func getFilterResult(s string, filter *Filter, newStrings *[]string) {
	switch filter.Type {
	case "xpath":
		getFilterResultXPath(s, filter, newStrings)
	case "json":
		getFilterResultJSON(s, filter, newStrings)
	case "css":
		getFilterResultCSS(s, filter, newStrings)
	case "replace":
		getFilterResultReplace(s, filter, newStrings)
	case "match":
		getFilterResultMatch(s, filter, newStrings)
	case "substring":
		getFilterResultSubstring(s, filter, newStrings)
	}
}
// getFilterResultXPath parses s as HTML, evaluates the XPath expression in
// filter.From against it, and appends the rendered, HTML-unescaped markup of
// every matching node to *newStrings.
//
// Parse failures, invalid XPath expressions and render failures are logged.
// The first two abort the filter without appending anything; a node that
// cannot be rendered is skipped.
func getFilterResultXPath(s string, filter *Filter, newStrings *[]string) {
	doc, err := htmlquery.Parse(strings.NewReader(s))
	if err != nil {
		log.Print(err)
		return
	}

	// QueryAll reports malformed expressions through its error return;
	// surface that instead of silently producing no results.
	nodes, err := htmlquery.QueryAll(doc, filter.From)
	if err != nil {
		log.Print(err)
		return
	}

	for _, node := range nodes {
		var b bytes.Buffer
		if err := html.Render(&b, node); err != nil {
			log.Print(err)
			continue
		}
		*newStrings = append(*newStrings, html.UnescapeString(b.String()))
	}
}
// getFilterResultJSON looks up the gjson path filter.From in the JSON text s
// and appends the string form of every element of the result's Array() view
// to *newStrings.
func getFilterResultJSON(s string, filter *Filter, newStrings *[]string) {
	matches := gjson.Get(s, filter.From).Array()
	for i := range matches {
		*newStrings = append(*newStrings, matches[i].String())
	}
}
// getFilterResultCSS parses s as HTML, compiles filter.From as a CSS selector,
// and appends the rendered, HTML-unescaped markup of each selected node to
// *newStrings. If either the document or the selector fails to parse, the
// error is logged and nothing is appended.
func getFilterResultCSS(s string, filter *Filter, newStrings *[]string) {
	root, parseErr := html.Parse(strings.NewReader(s))
	if parseErr != nil {
		log.Print(parseErr)
		return
	}

	selector, selErr := cascadia.Parse(filter.From)
	if selErr != nil {
		log.Print(selErr)
		return
	}

	// One buffer is reused for all matches; String() copies the contents, so
	// resetting it does not affect strings that were already appended.
	var rendered bytes.Buffer
	matched := cascadia.QueryAll(root, selector)
	for i := range matched {
		rendered.Reset()
		html.Render(&rendered, matched[i])
		unescaped := html.UnescapeString(rendered.String())
		*newStrings = append(*newStrings, unescaped)
	}
}
// getFilterResultReplace compiles filter.From as a regular expression and
// appends to *newStrings a copy of s in which every match has been replaced by
// filter.To (regexp.ReplaceAllString semantics, so $1-style references are
// expanded). An invalid pattern is logged and nothing is appended.
func getFilterResultReplace(s string, filter *Filter, newStrings *[]string) {
	pattern, compileErr := regexp.Compile(filter.From)
	if compileErr != nil {
		log.Print(compileErr)
		return
	}

	replaced := pattern.ReplaceAllString(s, filter.To)
	*newStrings = append(*newStrings, replaced)
}
// getFilterResultMatch compiles filter.From as a regular expression and
// appends every non-overlapping match found in s to *newStrings, in order of
// appearance. An invalid pattern is logged and nothing is appended.
func getFilterResultMatch(s string, filter *Filter, newStrings *[]string) {
	pattern, compileErr := regexp.Compile(filter.From)
	if compileErr != nil {
		log.Print(compileErr)
		return
	}

	// -1 asks for all matches; appending an empty result leaves the slice
	// untouched.
	*newStrings = append(*newStrings, pattern.FindAllString(s, -1)...)
}
// getFilterResultSubstring builds one string out of selected runes of s and
// appends it to *newStrings.
//
// filter.From is a comma-separated list of selections, each either
//
//   - a single non-negative rune index, e.g. "3", or
//   - a range "from:to", "from:" or ":to" (from inclusive, to exclusive), where
//     a negative bound counts from the end of s.
//
// Selections are concatenated in the order given. Indices address runes, not
// bytes. Range bounds that fall outside s are clamped to its ends, a range
// whose start is not before its end contributes nothing, and a single index
// past the end of s is skipped, so no selection can cause a panic.
//
// If any selection is malformed (not a number, a negative single index, or
// more than one ':'), the whole filter is abandoned and nothing is appended.
// A bare ":" is accepted but selects nothing.
func getFilterResultSubstring(s string, filter *Filter, newStrings *[]string) {
	asRunes := []rune(s)
	var sb strings.Builder

	for _, substring := range strings.Split(filter.From, ",") {
		if !strings.Contains(substring, ":") {
			pos, err := strconv.ParseInt(substring, 10, 32)
			if err != nil || pos < 0 {
				return
			}
			// Skip indices past the end instead of panicking.
			if int(pos) < len(asRunes) {
				sb.WriteRune(asRunes[pos])
			}
			continue
		}

		bounds := strings.Split(substring, ":")
		if len(bounds) != 2 {
			return
		}
		if bounds[0] == "" && bounds[1] == "" {
			// A range with neither bound has always selected nothing.
			continue
		}

		from, ok := substringBound(bounds[0], 0, len(asRunes))
		if !ok {
			return
		}
		to, ok := substringBound(bounds[1], len(asRunes), len(asRunes))
		if !ok {
			return
		}
		if from < to {
			sb.WriteString(string(asRunes[from:to]))
		}
	}

	*newStrings = append(*newStrings, sb.String())
}

// substringBound resolves one side of a "from:to" selection against a string
// of the given rune length. An empty raw value yields def. A negative value
// counts from the end. The result is clamped to [0, length]. ok is false when
// raw is non-empty and not a valid 32-bit integer.
func substringBound(raw string, def, length int) (bound int, ok bool) {
	if raw == "" {
		return def, true
	}
	parsed, err := strconv.ParseInt(raw, 10, 32)
	if err != nil {
		return 0, false
	}
	bound = int(parsed)
	if bound < 0 {
		bound += length
	}
	if bound < 0 {
		bound = 0
	}
	if bound > length {
		bound = length
	}
	return bound, true
}