got scraping working on filters

This commit is contained in:
BroodjeAap 2022-09-26 19:49:31 +00:00
parent 90ff748fa3
commit 13b7093349
6 changed files with 281 additions and 9 deletions

View file

@ -72,6 +72,9 @@ func (web Web) watchView(c *gin.Context) {
var connections []FilterConnection
web.db.Model(&FilterConnection{}).Where("watch_id = ?", watch.ID).Find(&connections)
buildFilterTree(filters, connections)
fillFilterResults(filters)
c.HTML(http.StatusOK, "watchView", gin.H{
"Watch": watch,
"Filters": filters,

View file

@ -12,7 +12,7 @@ type Filter struct {
Name string `form:"filter_name" yaml:"filter_name" json:"filter_name" binding:"required" validate:"min=1"`
X int `form:"x" yaml:"x" json:"x" validate:"default=0"`
Y int `form:"y" yaml:"y" json:"y" validate:"default=0"`
Type string `form:"filter_type" yaml:"filter_type" json:"filter_type" binding:"required" validate:"oneof=url xpath json css replace match substring"`
Type string `form:"filter_type" yaml:"filter_type" json:"filter_type" binding:"required" validate:"oneof=url xpath json css replace match substring min max average count"`
Var1 string `form:"var1" yaml:"var1" json:"var1" binding:"required"`
Var2 *string `form:"var2" yaml:"var2" json:"var2"`
Var3 *string `form:"var3" yaml:"var3" json:"var3"`

View file

@ -2,7 +2,11 @@ package main
import (
"bytes"
"fmt"
"io/ioutil"
"log"
"math"
"net/http"
"regexp"
"strconv"
"strings"
@ -13,8 +17,37 @@ import (
"golang.org/x/net/html"
)
func fillFilterResults(filters []Filter) {
processedMap := make(map[uint]bool, len(filters))
for len(filters) > 0 {
filter := &filters[0]
filters = filters[1:]
var allParentsProcessed = true
for _, parent := range filter.Parents {
if _, contains := processedMap[parent.ID]; !contains {
allParentsProcessed = false
break
}
}
if !allParentsProcessed {
filters = append(filters, *filter)
continue
}
getFilterResult(filter)
processedMap[filter.ID] = true
}
}
func getFilterResult(filter *Filter) {
switch {
case filter.Type == "gurl":
{
getFilterResultURL(filter)
}
case filter.Type == "gurls":
{
getFilterResultURL(filter)
}
case filter.Type == "xpath":
{
getFilterResultXPath(filter)
@ -39,11 +72,42 @@ func getFilterResult(filter *Filter) {
{
getFilterResultSubstring(filter)
}
case filter.Type == "min":
{
getFilterResultMin(filter)
}
case filter.Type == "max":
{
getFilterResultMax(filter)
}
case filter.Type == "average":
{
getFilterResultAverage(filter)
}
case filter.Type == "count":
{
getFilterResultCount(filter)
}
default:
log.Println("getFilterResult called with filter.Type == ", filter.Type)
}
}
func getFilterResultURL(filter *Filter) {
url := filter.Var1
resp, err := http.Get(url)
if err != nil {
log.Println("Could not fetch url", url)
log.Println("Reason:", err)
}
body, err := ioutil.ReadAll(resp.Body)
if err != nil {
log.Println("Could not fetch url", url)
log.Println("Reason:", err)
}
filter.Results = append(filter.Results, string(body))
}
func getFilterResultXPath(filter *Filter) {
if filter.Parents == nil {
log.Println("Filter", filter.Name, "called without parents for", filter.Type)
@ -100,7 +164,6 @@ func getFilterResultCSS(filter *Filter) {
for _, node := range cascadia.QueryAll(doc, sel) {
var b bytes.Buffer
html.Render(&b, node)
log.Println(b.String())
filter.Results = append(filter.Results, html.UnescapeString(b.String()))
}
}
@ -140,9 +203,7 @@ func getFilterResultMatch(filter *Filter) {
}
for _, parent := range filter.Parents {
for _, result := range parent.Results {
log.Println(">", result)
for _, str := range r.FindAllString(result, -1) {
log.Println(">>", str)
filter.Results = append(filter.Results, str)
}
}
@ -209,3 +270,64 @@ func getFilterResultSubstring(filter *Filter) {
}
}
}
func getFilterResultMin(filter *Filter) {
var min = math.MaxFloat64
var setMin = false
for _, parent := range filter.Parents {
for _, result := range parent.Results {
if number, err := strconv.ParseFloat(result, 64); err == nil {
if number < min {
min = number
setMin = true
}
}
}
}
if setMin {
filter.Results = append(filter.Results, fmt.Sprintf("%f", min))
}
}
func getFilterResultMax(filter *Filter) {
var max = -math.MaxFloat64
var setMax = false
for _, parent := range filter.Parents {
for _, result := range parent.Results {
if number, err := strconv.ParseFloat(result, 64); err == nil {
if number > max {
max = number
setMax = true
}
}
}
}
if setMax {
filter.Results = append(filter.Results, fmt.Sprintf("%f", max))
}
}
func getFilterResultAverage(filter *Filter) {
var sum float64 = 0.0
var count float64 = 0.0
for _, parent := range filter.Parents {
for _, result := range parent.Results {
if number, err := strconv.ParseFloat(result, 64); err == nil {
sum += number
count++
}
}
}
filter.Results = append(filter.Results, fmt.Sprintf("%f", sum/count))
}
func getFilterResultCount(filter *Filter) {
var count = 0
for _, parent := range filter.Parents {
count += len(parent.Children)
}
log.Println(fmt.Sprintf("%d", count))
filter.Results = append(filter.Results, fmt.Sprintf("%d", count))
}

View file

@ -32,14 +32,47 @@ var __values = (this && this.__values) || function(o) {
function onTypeChange() {
var select = document.getElementById("typeInput");
var type = select.value;
var var1Div = document.getElementById("var1Div");
var var1Input = document.getElementById("var1Input");
var var1Label = document.getElementById("var1Label");
var var2Div = document.getElementById("var2Div");
var var2Input = document.getElementById("var2Input");
var var2Label = document.getElementById("var2Label");
var var3Div = document.getElementById("var3Div");
var var3Input = document.getElementById("var3Input");
var var3Label = document.getElementById("var3Label");
switch (type) {
case "gurl": {
var1Div.innerHTML = "";
var1Div.appendChild(var1Input);
var1Label.innerHTML = "URL";
var1Input.placeholder = "https://shopping.website.com";
var2Input.disabled = true;
var2Input.placeholder = "";
var2Label.innerHTML = "-";
var3Input.disabled = true;
var3Input.placeholder = "";
var3Label.innerHTML = "-";
break;
}
case "gurls": {
var1Div.innerHTML = "";
var1Div.appendChild(var1Input);
var1Label.innerHTML = "-";
var1Input.placeholder = "From parents";
var1Input.value = "-";
var1Input.disabled = true;
var2Input.disabled = true;
var2Input.placeholder = "";
var2Label.innerHTML = "-";
var3Input.disabled = true;
var3Input.placeholder = "";
var3Label.innerHTML = "-";
break;
}
case "xpath": {
var1Div.innerHTML = "";
var1Div.appendChild(var1Input);
var1Label.innerHTML = "XPath";
var1Input.placeholder = "//a[@class='price";
var2Input.disabled = true;
@ -51,6 +84,8 @@ function onTypeChange() {
break;
}
case "json": {
var1Div.innerHTML = "";
var1Div.appendChild(var1Input);
var1Label.innerHTML = "JSON";
var1Input.placeholder = "products.#.price";
var2Input.disabled = true;
@ -60,6 +95,8 @@ function onTypeChange() {
break;
}
case "css": {
var1Div.innerHTML = "";
var1Div.appendChild(var1Input);
var1Label.innerHTML = "Selector";
var1Input.placeholder = ".price";
var2Input.disabled = true;
@ -69,6 +106,8 @@ function onTypeChange() {
break;
}
case "replace": {
var1Div.innerHTML = "";
var1Div.appendChild(var1Input);
var1Label.innerHTML = "Regex";
var1Input.placeholder = "So[mM]e(thing|where)";
var2Input.disabled = false;
@ -78,6 +117,8 @@ function onTypeChange() {
break;
}
case "match": {
var1Div.innerHTML = "";
var1Div.appendChild(var1Input);
var1Label.innerHTML = "Regex";
var1Input.placeholder = "So[mM]e(thing|where)";
var2Input.disabled = true;
@ -95,6 +136,36 @@ function onTypeChange() {
var3Label.innerHTML = "-";
break;
}
case "math": {
var mathSelect = document.createElement("select");
mathSelect.classList.add("form-control");
var mathOptionMin = document.createElement("option");
mathOptionMin.value = "min";
mathOptionMin.innerHTML = "Min";
mathSelect.appendChild(mathOptionMin);
var mathOptionMax = document.createElement("option");
mathOptionMax.value = "max";
mathOptionMax.innerHTML = "Max";
mathSelect.appendChild(mathOptionMax);
var mathOptionAvg = document.createElement("option");
mathOptionAvg.value = "average";
mathOptionAvg.innerHTML = "Average";
mathSelect.appendChild(mathOptionAvg);
var mathOptionCount = document.createElement("option");
mathOptionCount.value = "count";
mathOptionCount.innerHTML = "Count";
mathSelect.appendChild(mathOptionCount);
var1Input.remove();
var1Div.appendChild(mathSelect);
var1Label.innerHTML = "Function";
var2Input.disabled = true;
var2Input.placeholder = "";
var2Label.innerHTML = "-";
var3Input.disabled = true;
var3Input.placeholder = "";
var3Label.innerHTML = "-";
break;
}
}
}
function onSubmitNewFilter() {

View file

@ -2,17 +2,50 @@ function onTypeChange(){
let select = document.getElementById("typeInput") as HTMLSelectElement;
let type = select.value;
let var1Div = document.getElementById("var1Div") as HTMLDivElement;
let var1Input = document.getElementById("var1Input") as HTMLInputElement;
let var1Label = document.getElementById("var1Label") as HTMLLabelElement;
let var2Div = document.getElementById("var2Div") as HTMLDivElement;
let var2Input = document.getElementById("var2Input") as HTMLInputElement;
let var2Label = document.getElementById("var2Label") as HTMLLabelElement;
let var3Div = document.getElementById("var3Div") as HTMLDivElement;
let var3Input = document.getElementById("var3Input") as HTMLInputElement;
let var3Label = document.getElementById("var3Label") as HTMLLabelElement;
switch(type){
case "gurl": {
var1Div.innerHTML = "";
var1Div.appendChild(var1Input);
var1Label.innerHTML = "URL";
var1Input.placeholder = "https://shopping.website.com";
var2Input.disabled = true;
var2Input.placeholder = ""
var2Label.innerHTML = "-";
var3Input.disabled = true;
var3Input.placeholder = ""
var3Label.innerHTML = "-";
break;
}
case "gurls": {
var1Div.innerHTML = "";
var1Div.appendChild(var1Input);
var1Label.innerHTML = "-";
var1Input.placeholder = "From parents";
var1Input.value = "-";
var1Input.disabled = true;
var2Input.disabled = true;
var2Input.placeholder = ""
var2Label.innerHTML = "-";
var3Input.disabled = true;
var3Input.placeholder = ""
var3Label.innerHTML = "-";
break;
}
case "xpath": {
var1Div.innerHTML = "";
var1Div.appendChild(var1Input);
var1Label.innerHTML = "XPath";
var1Input.placeholder = "//a[@class='price";
var2Input.disabled = true;
@ -24,6 +57,8 @@ function onTypeChange(){
break;
}
case "json": {
var1Div.innerHTML = "";
var1Div.appendChild(var1Input);
var1Label.innerHTML = "JSON";
var1Input.placeholder = "products.#.price";
var2Input.disabled = true;
@ -33,6 +68,8 @@ function onTypeChange(){
break;
}
case "css": {
var1Div.innerHTML = "";
var1Div.appendChild(var1Input);
var1Label.innerHTML = "Selector";
var1Input.placeholder = ".price";
var2Input.disabled = true;
@ -42,6 +79,8 @@ function onTypeChange(){
break;
}
case "replace": {
var1Div.innerHTML = "";
var1Div.appendChild(var1Input);
var1Label.innerHTML = "Regex";
var1Input.placeholder = "So[mM]e(thing|where)";
var2Input.disabled = false;
@ -51,6 +90,8 @@ function onTypeChange(){
break;
}
case "match": {
var1Div.innerHTML = "";
var1Div.appendChild(var1Input);
var1Label.innerHTML = "Regex";
var1Input.placeholder = "So[mM]e(thing|where)";
var2Input.disabled = true;
@ -68,6 +109,38 @@ function onTypeChange(){
var3Label.innerHTML = "-";
break;
}
case "math": {
let mathSelect = document.createElement("select");
mathSelect.classList.add("form-control");
let mathOptionMin = document.createElement("option");
mathOptionMin.value = "min"
mathOptionMin.innerHTML = "Min";
mathSelect.appendChild(mathOptionMin);
let mathOptionMax = document.createElement("option")
mathOptionMax.value = "max";
mathOptionMax.innerHTML = "Max";
mathSelect.appendChild(mathOptionMax);
let mathOptionAvg = document.createElement("option")
mathOptionAvg.value = "average";
mathOptionAvg.innerHTML = "Average";
mathSelect.appendChild(mathOptionAvg);
let mathOptionCount = document.createElement("option")
mathOptionCount.value = "count";
mathOptionCount.innerHTML = "Count";
mathSelect.appendChild(mathOptionCount);
var1Input.remove();
var1Div.appendChild(mathSelect);
var1Label.innerHTML = "Function";
var2Input.disabled = true;
var2Input.placeholder = ""
var2Label.innerHTML = "-";
var3Input.disabled = true;
var3Input.placeholder = ""
var3Label.innerHTML = "-";
break;
}
}
}

View file

@ -37,24 +37,27 @@
<div class="col-sm-10 p-2">
<select id="typeInput" class="form-control" name="type">
<option value="xpath" selected="true">XPath</option>
<option value="gurl">Get URL</option>
<option value="gurls">Get URLs</option>
<option value="json">JSON</option>
<option value="css">CSS</option>
<option value="replace">Replace</option>
<option value="match">Match</option>
<option value="substring">Substring</option>
<option value="math">Math</option>
</select>
</div>
<label for="var1" id="var1Label" class="col-sm-2 col-form-label">XPath:</label>
<div class="col-sm-10 p-2">
<div class="col-sm-10 p-2" id="var1Div">
<input type="text" class="form-control" name="var1" id="var1Input" placeholder="//a[@class='price']">
</div>
<label for="var2" id="var2Label" class="col-sm-2 col-form-label">-</label>
<div class="col-sm-10 p-2">
<div class="col-sm-10 p-2" id="var2Div">
<input type="text" class="form-control" name="var2" id="var2Input" placeholder="" disabled>
</div>
<label for="var3" id="var3Label" class="col-sm-2 col-form-label">-</label>
<div class="col-sm-10 p-2">
<div class="col-sm-10 p-2" id="var3Div">
<input type="text" class="form-control" name="var3" id="var3Input" placeholder="" disabled>
</div>
</div>