got scraping working on filters
This commit is contained in:
parent
90ff748fa3
commit
13b7093349
6 changed files with 281 additions and 9 deletions
3
main.go
3
main.go
|
@ -72,6 +72,9 @@ func (web Web) watchView(c *gin.Context) {
|
|||
var connections []FilterConnection
|
||||
web.db.Model(&FilterConnection{}).Where("watch_id = ?", watch.ID).Find(&connections)
|
||||
|
||||
buildFilterTree(filters, connections)
|
||||
fillFilterResults(filters)
|
||||
|
||||
c.HTML(http.StatusOK, "watchView", gin.H{
|
||||
"Watch": watch,
|
||||
"Filters": filters,
|
||||
|
|
|
@ -12,7 +12,7 @@ type Filter struct {
|
|||
Name string `form:"filter_name" yaml:"filter_name" json:"filter_name" binding:"required" validate:"min=1"`
|
||||
X int `form:"x" yaml:"x" json:"x" validate:"default=0"`
|
||||
Y int `form:"y" yaml:"y" json:"y" validate:"default=0"`
|
||||
Type string `form:"filter_type" yaml:"filter_type" json:"filter_type" binding:"required" validate:"oneof=url xpath json css replace match substring"`
|
||||
Type string `form:"filter_type" yaml:"filter_type" json:"filter_type" binding:"required" validate:"oneof=url xpath json css replace match substring min max average count"`
|
||||
Var1 string `form:"var1" yaml:"var1" json:"var1" binding:"required"`
|
||||
Var2 *string `form:"var2" yaml:"var2" json:"var2"`
|
||||
Var3 *string `form:"var3" yaml:"var3" json:"var3"`
|
||||
|
|
130
scraping.go
130
scraping.go
|
@ -2,7 +2,11 @@ package main
|
|||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"log"
|
||||
"math"
|
||||
"net/http"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
@ -13,8 +17,37 @@ import (
|
|||
"golang.org/x/net/html"
|
||||
)
|
||||
|
||||
func fillFilterResults(filters []Filter) {
|
||||
processedMap := make(map[uint]bool, len(filters))
|
||||
for len(filters) > 0 {
|
||||
filter := &filters[0]
|
||||
filters = filters[1:]
|
||||
var allParentsProcessed = true
|
||||
for _, parent := range filter.Parents {
|
||||
if _, contains := processedMap[parent.ID]; !contains {
|
||||
allParentsProcessed = false
|
||||
break
|
||||
}
|
||||
}
|
||||
if !allParentsProcessed {
|
||||
filters = append(filters, *filter)
|
||||
continue
|
||||
}
|
||||
getFilterResult(filter)
|
||||
processedMap[filter.ID] = true
|
||||
}
|
||||
}
|
||||
|
||||
func getFilterResult(filter *Filter) {
|
||||
switch {
|
||||
case filter.Type == "gurl":
|
||||
{
|
||||
getFilterResultURL(filter)
|
||||
}
|
||||
case filter.Type == "gurls":
|
||||
{
|
||||
getFilterResultURL(filter)
|
||||
}
|
||||
case filter.Type == "xpath":
|
||||
{
|
||||
getFilterResultXPath(filter)
|
||||
|
@ -39,11 +72,42 @@ func getFilterResult(filter *Filter) {
|
|||
{
|
||||
getFilterResultSubstring(filter)
|
||||
}
|
||||
case filter.Type == "min":
|
||||
{
|
||||
getFilterResultMin(filter)
|
||||
}
|
||||
case filter.Type == "max":
|
||||
{
|
||||
getFilterResultMax(filter)
|
||||
}
|
||||
case filter.Type == "average":
|
||||
{
|
||||
getFilterResultAverage(filter)
|
||||
}
|
||||
case filter.Type == "count":
|
||||
{
|
||||
getFilterResultCount(filter)
|
||||
}
|
||||
default:
|
||||
|
||||
log.Println("getFilterResult called with filter.Type == ", filter.Type)
|
||||
}
|
||||
}
|
||||
|
||||
func getFilterResultURL(filter *Filter) {
|
||||
url := filter.Var1
|
||||
resp, err := http.Get(url)
|
||||
if err != nil {
|
||||
log.Println("Could not fetch url", url)
|
||||
log.Println("Reason:", err)
|
||||
}
|
||||
body, err := ioutil.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
log.Println("Could not fetch url", url)
|
||||
log.Println("Reason:", err)
|
||||
}
|
||||
filter.Results = append(filter.Results, string(body))
|
||||
}
|
||||
|
||||
func getFilterResultXPath(filter *Filter) {
|
||||
if filter.Parents == nil {
|
||||
log.Println("Filter", filter.Name, "called without parents for", filter.Type)
|
||||
|
@ -100,7 +164,6 @@ func getFilterResultCSS(filter *Filter) {
|
|||
for _, node := range cascadia.QueryAll(doc, sel) {
|
||||
var b bytes.Buffer
|
||||
html.Render(&b, node)
|
||||
log.Println(b.String())
|
||||
filter.Results = append(filter.Results, html.UnescapeString(b.String()))
|
||||
}
|
||||
}
|
||||
|
@ -140,9 +203,7 @@ func getFilterResultMatch(filter *Filter) {
|
|||
}
|
||||
for _, parent := range filter.Parents {
|
||||
for _, result := range parent.Results {
|
||||
log.Println(">", result)
|
||||
for _, str := range r.FindAllString(result, -1) {
|
||||
log.Println(">>", str)
|
||||
filter.Results = append(filter.Results, str)
|
||||
}
|
||||
}
|
||||
|
@ -209,3 +270,64 @@ func getFilterResultSubstring(filter *Filter) {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
func getFilterResultMin(filter *Filter) {
|
||||
var min = math.MaxFloat64
|
||||
var setMin = false
|
||||
for _, parent := range filter.Parents {
|
||||
for _, result := range parent.Results {
|
||||
if number, err := strconv.ParseFloat(result, 64); err == nil {
|
||||
if number < min {
|
||||
min = number
|
||||
setMin = true
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if setMin {
|
||||
filter.Results = append(filter.Results, fmt.Sprintf("%f", min))
|
||||
}
|
||||
}
|
||||
|
||||
func getFilterResultMax(filter *Filter) {
|
||||
var max = -math.MaxFloat64
|
||||
var setMax = false
|
||||
for _, parent := range filter.Parents {
|
||||
for _, result := range parent.Results {
|
||||
if number, err := strconv.ParseFloat(result, 64); err == nil {
|
||||
if number > max {
|
||||
max = number
|
||||
setMax = true
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if setMax {
|
||||
filter.Results = append(filter.Results, fmt.Sprintf("%f", max))
|
||||
}
|
||||
}
|
||||
|
||||
func getFilterResultAverage(filter *Filter) {
|
||||
var sum float64 = 0.0
|
||||
var count float64 = 0.0
|
||||
for _, parent := range filter.Parents {
|
||||
for _, result := range parent.Results {
|
||||
if number, err := strconv.ParseFloat(result, 64); err == nil {
|
||||
sum += number
|
||||
count++
|
||||
}
|
||||
}
|
||||
}
|
||||
filter.Results = append(filter.Results, fmt.Sprintf("%f", sum/count))
|
||||
}
|
||||
|
||||
func getFilterResultCount(filter *Filter) {
|
||||
var count = 0
|
||||
for _, parent := range filter.Parents {
|
||||
count += len(parent.Children)
|
||||
}
|
||||
log.Println(fmt.Sprintf("%d", count))
|
||||
filter.Results = append(filter.Results, fmt.Sprintf("%d", count))
|
||||
}
|
||||
|
|
|
@ -32,14 +32,47 @@ var __values = (this && this.__values) || function(o) {
|
|||
function onTypeChange() {
|
||||
var select = document.getElementById("typeInput");
|
||||
var type = select.value;
|
||||
var var1Div = document.getElementById("var1Div");
|
||||
var var1Input = document.getElementById("var1Input");
|
||||
var var1Label = document.getElementById("var1Label");
|
||||
var var2Div = document.getElementById("var2Div");
|
||||
var var2Input = document.getElementById("var2Input");
|
||||
var var2Label = document.getElementById("var2Label");
|
||||
var var3Div = document.getElementById("var3Div");
|
||||
var var3Input = document.getElementById("var3Input");
|
||||
var var3Label = document.getElementById("var3Label");
|
||||
switch (type) {
|
||||
case "gurl": {
|
||||
var1Div.innerHTML = "";
|
||||
var1Div.appendChild(var1Input);
|
||||
var1Label.innerHTML = "URL";
|
||||
var1Input.placeholder = "https://shopping.website.com";
|
||||
var2Input.disabled = true;
|
||||
var2Input.placeholder = "";
|
||||
var2Label.innerHTML = "-";
|
||||
var3Input.disabled = true;
|
||||
var3Input.placeholder = "";
|
||||
var3Label.innerHTML = "-";
|
||||
break;
|
||||
}
|
||||
case "gurls": {
|
||||
var1Div.innerHTML = "";
|
||||
var1Div.appendChild(var1Input);
|
||||
var1Label.innerHTML = "-";
|
||||
var1Input.placeholder = "From parents";
|
||||
var1Input.value = "-";
|
||||
var1Input.disabled = true;
|
||||
var2Input.disabled = true;
|
||||
var2Input.placeholder = "";
|
||||
var2Label.innerHTML = "-";
|
||||
var3Input.disabled = true;
|
||||
var3Input.placeholder = "";
|
||||
var3Label.innerHTML = "-";
|
||||
break;
|
||||
}
|
||||
case "xpath": {
|
||||
var1Div.innerHTML = "";
|
||||
var1Div.appendChild(var1Input);
|
||||
var1Label.innerHTML = "XPath";
|
||||
var1Input.placeholder = "//a[@class='price";
|
||||
var2Input.disabled = true;
|
||||
|
@ -51,6 +84,8 @@ function onTypeChange() {
|
|||
break;
|
||||
}
|
||||
case "json": {
|
||||
var1Div.innerHTML = "";
|
||||
var1Div.appendChild(var1Input);
|
||||
var1Label.innerHTML = "JSON";
|
||||
var1Input.placeholder = "products.#.price";
|
||||
var2Input.disabled = true;
|
||||
|
@ -60,6 +95,8 @@ function onTypeChange() {
|
|||
break;
|
||||
}
|
||||
case "css": {
|
||||
var1Div.innerHTML = "";
|
||||
var1Div.appendChild(var1Input);
|
||||
var1Label.innerHTML = "Selector";
|
||||
var1Input.placeholder = ".price";
|
||||
var2Input.disabled = true;
|
||||
|
@ -69,6 +106,8 @@ function onTypeChange() {
|
|||
break;
|
||||
}
|
||||
case "replace": {
|
||||
var1Div.innerHTML = "";
|
||||
var1Div.appendChild(var1Input);
|
||||
var1Label.innerHTML = "Regex";
|
||||
var1Input.placeholder = "So[mM]e(thing|where)";
|
||||
var2Input.disabled = false;
|
||||
|
@ -78,6 +117,8 @@ function onTypeChange() {
|
|||
break;
|
||||
}
|
||||
case "match": {
|
||||
var1Div.innerHTML = "";
|
||||
var1Div.appendChild(var1Input);
|
||||
var1Label.innerHTML = "Regex";
|
||||
var1Input.placeholder = "So[mM]e(thing|where)";
|
||||
var2Input.disabled = true;
|
||||
|
@ -95,6 +136,36 @@ function onTypeChange() {
|
|||
var3Label.innerHTML = "-";
|
||||
break;
|
||||
}
|
||||
case "math": {
|
||||
var mathSelect = document.createElement("select");
|
||||
mathSelect.classList.add("form-control");
|
||||
var mathOptionMin = document.createElement("option");
|
||||
mathOptionMin.value = "min";
|
||||
mathOptionMin.innerHTML = "Min";
|
||||
mathSelect.appendChild(mathOptionMin);
|
||||
var mathOptionMax = document.createElement("option");
|
||||
mathOptionMax.value = "max";
|
||||
mathOptionMax.innerHTML = "Max";
|
||||
mathSelect.appendChild(mathOptionMax);
|
||||
var mathOptionAvg = document.createElement("option");
|
||||
mathOptionAvg.value = "average";
|
||||
mathOptionAvg.innerHTML = "Average";
|
||||
mathSelect.appendChild(mathOptionAvg);
|
||||
var mathOptionCount = document.createElement("option");
|
||||
mathOptionCount.value = "count";
|
||||
mathOptionCount.innerHTML = "Count";
|
||||
mathSelect.appendChild(mathOptionCount);
|
||||
var1Input.remove();
|
||||
var1Div.appendChild(mathSelect);
|
||||
var1Label.innerHTML = "Function";
|
||||
var2Input.disabled = true;
|
||||
var2Input.placeholder = "";
|
||||
var2Label.innerHTML = "-";
|
||||
var3Input.disabled = true;
|
||||
var3Input.placeholder = "";
|
||||
var3Label.innerHTML = "-";
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
function onSubmitNewFilter() {
|
||||
|
|
|
@ -2,17 +2,50 @@ function onTypeChange(){
|
|||
let select = document.getElementById("typeInput") as HTMLSelectElement;
|
||||
let type = select.value;
|
||||
|
||||
let var1Div = document.getElementById("var1Div") as HTMLDivElement;
|
||||
let var1Input = document.getElementById("var1Input") as HTMLInputElement;
|
||||
let var1Label = document.getElementById("var1Label") as HTMLLabelElement;
|
||||
|
||||
let var2Div = document.getElementById("var2Div") as HTMLDivElement;
|
||||
let var2Input = document.getElementById("var2Input") as HTMLInputElement;
|
||||
let var2Label = document.getElementById("var2Label") as HTMLLabelElement;
|
||||
|
||||
|
||||
let var3Div = document.getElementById("var3Div") as HTMLDivElement;
|
||||
let var3Input = document.getElementById("var3Input") as HTMLInputElement;
|
||||
let var3Label = document.getElementById("var3Label") as HTMLLabelElement;
|
||||
|
||||
switch(type){
|
||||
case "gurl": {
|
||||
var1Div.innerHTML = "";
|
||||
var1Div.appendChild(var1Input);
|
||||
var1Label.innerHTML = "URL";
|
||||
var1Input.placeholder = "https://shopping.website.com";
|
||||
var2Input.disabled = true;
|
||||
var2Input.placeholder = ""
|
||||
var2Label.innerHTML = "-";
|
||||
var3Input.disabled = true;
|
||||
var3Input.placeholder = ""
|
||||
var3Label.innerHTML = "-";
|
||||
break;
|
||||
}
|
||||
case "gurls": {
|
||||
var1Div.innerHTML = "";
|
||||
var1Div.appendChild(var1Input);
|
||||
var1Label.innerHTML = "-";
|
||||
var1Input.placeholder = "From parents";
|
||||
var1Input.value = "-";
|
||||
var1Input.disabled = true;
|
||||
var2Input.disabled = true;
|
||||
var2Input.placeholder = ""
|
||||
var2Label.innerHTML = "-";
|
||||
var3Input.disabled = true;
|
||||
var3Input.placeholder = ""
|
||||
var3Label.innerHTML = "-";
|
||||
break;
|
||||
}
|
||||
case "xpath": {
|
||||
var1Div.innerHTML = "";
|
||||
var1Div.appendChild(var1Input);
|
||||
var1Label.innerHTML = "XPath";
|
||||
var1Input.placeholder = "//a[@class='price";
|
||||
var2Input.disabled = true;
|
||||
|
@ -24,6 +57,8 @@ function onTypeChange(){
|
|||
break;
|
||||
}
|
||||
case "json": {
|
||||
var1Div.innerHTML = "";
|
||||
var1Div.appendChild(var1Input);
|
||||
var1Label.innerHTML = "JSON";
|
||||
var1Input.placeholder = "products.#.price";
|
||||
var2Input.disabled = true;
|
||||
|
@ -33,6 +68,8 @@ function onTypeChange(){
|
|||
break;
|
||||
}
|
||||
case "css": {
|
||||
var1Div.innerHTML = "";
|
||||
var1Div.appendChild(var1Input);
|
||||
var1Label.innerHTML = "Selector";
|
||||
var1Input.placeholder = ".price";
|
||||
var2Input.disabled = true;
|
||||
|
@ -42,6 +79,8 @@ function onTypeChange(){
|
|||
break;
|
||||
}
|
||||
case "replace": {
|
||||
var1Div.innerHTML = "";
|
||||
var1Div.appendChild(var1Input);
|
||||
var1Label.innerHTML = "Regex";
|
||||
var1Input.placeholder = "So[mM]e(thing|where)";
|
||||
var2Input.disabled = false;
|
||||
|
@ -51,6 +90,8 @@ function onTypeChange(){
|
|||
break;
|
||||
}
|
||||
case "match": {
|
||||
var1Div.innerHTML = "";
|
||||
var1Div.appendChild(var1Input);
|
||||
var1Label.innerHTML = "Regex";
|
||||
var1Input.placeholder = "So[mM]e(thing|where)";
|
||||
var2Input.disabled = true;
|
||||
|
@ -68,6 +109,38 @@ function onTypeChange(){
|
|||
var3Label.innerHTML = "-";
|
||||
break;
|
||||
}
|
||||
case "math": {
|
||||
let mathSelect = document.createElement("select");
|
||||
mathSelect.classList.add("form-control");
|
||||
let mathOptionMin = document.createElement("option");
|
||||
mathOptionMin.value = "min"
|
||||
mathOptionMin.innerHTML = "Min";
|
||||
mathSelect.appendChild(mathOptionMin);
|
||||
let mathOptionMax = document.createElement("option")
|
||||
mathOptionMax.value = "max";
|
||||
mathOptionMax.innerHTML = "Max";
|
||||
mathSelect.appendChild(mathOptionMax);
|
||||
let mathOptionAvg = document.createElement("option")
|
||||
mathOptionAvg.value = "average";
|
||||
mathOptionAvg.innerHTML = "Average";
|
||||
mathSelect.appendChild(mathOptionAvg);
|
||||
let mathOptionCount = document.createElement("option")
|
||||
mathOptionCount.value = "count";
|
||||
mathOptionCount.innerHTML = "Count";
|
||||
mathSelect.appendChild(mathOptionCount);
|
||||
|
||||
var1Input.remove();
|
||||
var1Div.appendChild(mathSelect);
|
||||
|
||||
var1Label.innerHTML = "Function";
|
||||
var2Input.disabled = true;
|
||||
var2Input.placeholder = ""
|
||||
var2Label.innerHTML = "-";
|
||||
var3Input.disabled = true;
|
||||
var3Input.placeholder = ""
|
||||
var3Label.innerHTML = "-";
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -37,24 +37,27 @@
|
|||
<div class="col-sm-10 p-2">
|
||||
<select id="typeInput" class="form-control" name="type">
|
||||
<option value="xpath" selected="true">XPath</option>
|
||||
<option value="gurl">Get URL</option>
|
||||
<option value="gurls">Get URLs</option>
|
||||
<option value="json">JSON</option>
|
||||
<option value="css">CSS</option>
|
||||
<option value="replace">Replace</option>
|
||||
<option value="match">Match</option>
|
||||
<option value="substring">Substring</option>
|
||||
<option value="math">Math</option>
|
||||
</select>
|
||||
</div>
|
||||
|
||||
<label for="var1" id="var1Label" class="col-sm-2 col-form-label">XPath:</label>
|
||||
<div class="col-sm-10 p-2">
|
||||
<div class="col-sm-10 p-2" id="var1Div">
|
||||
<input type="text" class="form-control" name="var1" id="var1Input" placeholder="//a[@class='price']">
|
||||
</div>
|
||||
<label for="var2" id="var2Label" class="col-sm-2 col-form-label">-</label>
|
||||
<div class="col-sm-10 p-2">
|
||||
<div class="col-sm-10 p-2" id="var2Div">
|
||||
<input type="text" class="form-control" name="var2" id="var2Input" placeholder="" disabled>
|
||||
</div>
|
||||
<label for="var3" id="var3Label" class="col-sm-2 col-form-label">-</label>
|
||||
<div class="col-sm-10 p-2">
|
||||
<div class="col-sm-10 p-2" id="var3Div">
|
||||
<input type="text" class="form-control" name="var3" id="var3Input" placeholder="" disabled>
|
||||
</div>
|
||||
</div>
|
||||
|
|
Loading…
Add table
Reference in a new issue