Split get URL(s) into two functions: one 'normal' get and one browserless get
This commit is contained in:
parent
a8ddee70fb
commit
6057506f7b
5 changed files with 225 additions and 48 deletions
147
scraping.go
147
scraping.go
|
@ -3,6 +3,7 @@ package main
|
|||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"html/template"
|
||||
"io/ioutil"
|
||||
|
@ -126,6 +127,14 @@ func getFilterResult(filters []Filter, filter *Filter, watch *Watch, web *Web, d
|
|||
{
|
||||
getFilterResultURLs(filter, web.urlCache, debug)
|
||||
}
|
||||
case filter.Type == "bgurl":
|
||||
{
|
||||
getFilterResultBrowserlessURL(filter, web.urlCache, debug)
|
||||
}
|
||||
case filter.Type == "bgurls":
|
||||
{
|
||||
getFilterResultBrowserlessURLs(filter, web.urlCache, debug)
|
||||
}
|
||||
case filter.Type == "xpath":
|
||||
{
|
||||
getFilterResultXPath(filter)
|
||||
|
@ -250,6 +259,8 @@ func getFilterResultURL(filter *Filter, urlCache map[string]string, debug bool)
|
|||
}
|
||||
str, err := getURLContent(filter, fetchURL)
|
||||
if err != nil {
|
||||
log.Println("Could not fetch url: ", fetchURL, " - ", err)
|
||||
filter.log("Could not fetch url: ", fetchURL, " - ", err)
|
||||
return
|
||||
}
|
||||
filter.Results = append(filter.Results, str)
|
||||
|
@ -270,6 +281,8 @@ func getFilterResultURLs(filter *Filter, urlCache map[string]string, debug bool)
|
|||
|
||||
str, err := getURLContent(filter, fetchURL)
|
||||
if err != nil {
|
||||
log.Println("Could not fetch url: ", fetchURL, " - ", err)
|
||||
filter.log("Could not fetch url: ", fetchURL, " - ", err)
|
||||
continue
|
||||
}
|
||||
filter.Results = append(filter.Results, str)
|
||||
|
@ -281,62 +294,102 @@ func getFilterResultURLs(filter *Filter, urlCache map[string]string, debug bool)
|
|||
}
|
||||
|
||||
func getURLContent(filter *Filter, fetchURL string) (string, error) {
|
||||
var body []byte
|
||||
if viper.IsSet("browserless.url") {
|
||||
browserlessURL := viper.GetString("browserless.url")
|
||||
data := struct {
|
||||
URL string `json:"url"`
|
||||
}{
|
||||
URL: fetchURL,
|
||||
}
|
||||
jsn, err := json.Marshal(data)
|
||||
var httpClient *http.Client
|
||||
if viper.IsSet("proxy.proxy_url") {
|
||||
proxyUrl, err := url.Parse(viper.GetString("proxy.proxy_url"))
|
||||
if err != nil {
|
||||
log.Println("Could not marshal url:", err)
|
||||
filter.log("Could not marshal url:", err)
|
||||
return "", err
|
||||
}
|
||||
resp, err := http.Post(browserlessURL, "application/json", bytes.NewBuffer(jsn))
|
||||
if err != nil {
|
||||
log.Println("Could not get browserless response content:", err)
|
||||
filter.log("Could not get browserless response content:", err)
|
||||
return "", err
|
||||
}
|
||||
body, err = ioutil.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
log.Println("Could not fetch url through browserless: ", fetchURL, " - ", err)
|
||||
filter.log("Could not fetch url through browserless: ", fetchURL, " - ", err)
|
||||
return "", err
|
||||
}
|
||||
httpClient = &http.Client{Transport: &http.Transport{Proxy: http.ProxyURL(proxyUrl)}}
|
||||
} else {
|
||||
var httpClient *http.Client
|
||||
if viper.IsSet("proxy.proxy_url") {
|
||||
proxyUrl, err := url.Parse(viper.GetString("proxy.proxy_url"))
|
||||
if err != nil {
|
||||
log.Println("Could not parse proxy url, check config")
|
||||
filter.log("Could not parse proxy url, check config")
|
||||
return "", err
|
||||
}
|
||||
httpClient = &http.Client{Transport: &http.Transport{Proxy: http.ProxyURL(proxyUrl)}}
|
||||
} else {
|
||||
httpClient = &http.Client{}
|
||||
}
|
||||
resp, err := httpClient.Get(fetchURL)
|
||||
if err != nil {
|
||||
log.Println("Could not fetch url: ", fetchURL, " - ", err)
|
||||
filter.log("Could not fetch url: ", fetchURL, " - ", err)
|
||||
return "", err
|
||||
}
|
||||
body, err = ioutil.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
log.Println("Could not fetch url: ", fetchURL, " - ", err)
|
||||
filter.log("Could not fetch url: ", fetchURL, " - ", err)
|
||||
return "", err
|
||||
}
|
||||
httpClient = &http.Client{}
|
||||
}
|
||||
resp, err := httpClient.Get(fetchURL)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
body, err := ioutil.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
return string(body), nil
|
||||
}
|
||||
|
||||
func getFilterResultBrowserlessURL(filter *Filter, urlCache map[string]string, debug bool) {
|
||||
fetchURL := filter.Var1
|
||||
val, exists := urlCache["b"+fetchURL]
|
||||
if debug && exists {
|
||||
filter.Results = append(filter.Results, val)
|
||||
return
|
||||
}
|
||||
str, err := getBrowserlessURLContent(filter, fetchURL)
|
||||
if err != nil {
|
||||
log.Println("Could not fetch url: ", fetchURL, " - ", err)
|
||||
filter.log("Could not fetch url: ", fetchURL, " - ", err)
|
||||
return
|
||||
}
|
||||
filter.Results = append(filter.Results, str)
|
||||
if debug {
|
||||
urlCache["b"+fetchURL] = str
|
||||
}
|
||||
}
|
||||
|
||||
func getFilterResultBrowserlessURLs(filter *Filter, urlCache map[string]string, debug bool) {
|
||||
for _, parent := range filter.Parents {
|
||||
for _, result := range parent.Results {
|
||||
fetchURL := result
|
||||
val, exists := urlCache["b"+fetchURL]
|
||||
if debug && exists {
|
||||
filter.Results = append(filter.Results, val)
|
||||
continue
|
||||
}
|
||||
|
||||
str, err := getBrowserlessURLContent(filter, fetchURL)
|
||||
if err != nil {
|
||||
log.Println("Could not fetch url: ", fetchURL, " - ", err)
|
||||
filter.log("Could not fetch url: ", fetchURL, " - ", err)
|
||||
continue
|
||||
}
|
||||
filter.Results = append(filter.Results, str)
|
||||
if debug {
|
||||
urlCache["b"+fetchURL] = str
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func getBrowserlessURLContent(filter *Filter, fetchURL string) (string, error) {
|
||||
if !viper.IsSet("browserless.url") {
|
||||
return "", errors.New("browserless.url not set")
|
||||
}
|
||||
browserlessURL := viper.GetString("browserless.url")
|
||||
data := struct {
|
||||
URL string `json:"url"`
|
||||
}{
|
||||
URL: fetchURL,
|
||||
}
|
||||
jsn, err := json.Marshal(data)
|
||||
if err != nil {
|
||||
log.Println("Could not marshal url:", err)
|
||||
filter.log("Could not marshal url:", err)
|
||||
return "", err
|
||||
}
|
||||
resp, err := http.Post(browserlessURL, "application/json", bytes.NewBuffer(jsn))
|
||||
if err != nil {
|
||||
log.Println("Could not get browserless response content:", err)
|
||||
filter.log("Could not get browserless response content:", err)
|
||||
return "", err
|
||||
}
|
||||
body, err := ioutil.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
log.Println("Could not fetch url through browserless: ", fetchURL, " - ", err)
|
||||
filter.log("Could not fetch url through browserless: ", fetchURL, " - ", err)
|
||||
return "", err
|
||||
}
|
||||
return string(body), nil
|
||||
}
|
||||
|
||||
func getFilterResultXPath(filter *Filter) {
|
||||
selectType := "node"
|
||||
if filter.Var2 != nil {
|
||||
|
|
|
@ -114,6 +114,65 @@ function onTypeChange(node) {
|
|||
var3Div.appendChild(var3Input);
|
||||
break;
|
||||
}
|
||||
case "bgurl": {
|
||||
var var1Input = document.createElement("input");
|
||||
var1Input.name = "var1";
|
||||
var1Input.id = "var1Input";
|
||||
var1Input.value = var1Value;
|
||||
var1Input.classList.add("form-control");
|
||||
var1Label.innerHTML = "URL";
|
||||
var1Input.placeholder = "https://shopping.website.com";
|
||||
var1Div.appendChild(var1Input);
|
||||
var var2Input = document.createElement("input");
|
||||
var2Input.name = "var2";
|
||||
var2Input.id = "var2Input";
|
||||
var2Input.value = var2Value;
|
||||
var2Input.classList.add("form-control");
|
||||
var2Input.disabled = true;
|
||||
var2Input.placeholder = "";
|
||||
var2Label.innerHTML = "-";
|
||||
var2Div.appendChild(var2Input);
|
||||
var var3Input = document.createElement("input");
|
||||
var3Input.name = "var3";
|
||||
var3Input.id = "var3Input";
|
||||
var3Input.value = var3Value;
|
||||
var3Input.classList.add("form-control");
|
||||
var3Input.disabled = true;
|
||||
var3Input.placeholder = "";
|
||||
var3Label.innerHTML = "-";
|
||||
var3Div.appendChild(var3Input);
|
||||
break;
|
||||
}
|
||||
case "bgurls": {
|
||||
var var1Input = document.createElement("input");
|
||||
var1Input.name = "var1";
|
||||
var1Input.id = "var1Input";
|
||||
var1Input.value = var1Value;
|
||||
var1Input.classList.add("form-control");
|
||||
var1Label.innerHTML = "-";
|
||||
var1Input.placeholder = "From parents";
|
||||
var1Input.disabled = true;
|
||||
var1Div.appendChild(var1Input);
|
||||
var var2Input = document.createElement("input");
|
||||
var2Input.name = "var2";
|
||||
var2Input.id = "var2Input";
|
||||
var2Input.value = var2Value;
|
||||
var2Input.classList.add("form-control");
|
||||
var2Input.disabled = true;
|
||||
var2Input.placeholder = "";
|
||||
var2Label.innerHTML = "-";
|
||||
var2Div.appendChild(var2Input);
|
||||
var var3Input = document.createElement("input");
|
||||
var3Input.name = "var3";
|
||||
var3Input.id = "var3Input";
|
||||
var3Input.value = var3Value;
|
||||
var3Input.classList.add("form-control");
|
||||
var3Input.disabled = true;
|
||||
var3Input.placeholder = "";
|
||||
var3Label.innerHTML = "-";
|
||||
var3Div.appendChild(var3Input);
|
||||
break;
|
||||
}
|
||||
case "xpath": {
|
||||
var var1Input = document.createElement("input");
|
||||
var1Input.name = "var1";
|
||||
|
|
|
@ -90,6 +90,69 @@ function onTypeChange(node: DiagramNode | null = null){
|
|||
var3Div.appendChild(var3Input);
|
||||
break;
|
||||
}
|
||||
case "bgurl": {
|
||||
let var1Input = document.createElement("input");
|
||||
var1Input.name = "var1";
|
||||
var1Input.id = "var1Input";
|
||||
var1Input.value = var1Value;
|
||||
var1Input.classList.add("form-control");
|
||||
var1Label.innerHTML = "URL";
|
||||
var1Input.placeholder = "https://shopping.website.com";
|
||||
var1Div.appendChild(var1Input);
|
||||
|
||||
let var2Input = document.createElement("input");
|
||||
var2Input.name = "var2";
|
||||
var2Input.id = "var2Input";
|
||||
var2Input.value = var2Value;
|
||||
var2Input.classList.add("form-control");
|
||||
var2Input.disabled = true;
|
||||
var2Input.placeholder = ""
|
||||
var2Label.innerHTML = "-";
|
||||
var2Div.appendChild(var2Input);
|
||||
|
||||
let var3Input = document.createElement("input");
|
||||
var3Input.name = "var3";
|
||||
var3Input.id = "var3Input";
|
||||
var3Input.value = var3Value;
|
||||
var3Input.classList.add("form-control");
|
||||
var3Input.disabled = true;
|
||||
var3Input.placeholder = ""
|
||||
var3Label.innerHTML = "-";
|
||||
var3Div.appendChild(var3Input);
|
||||
break;
|
||||
}
|
||||
case "bgurls": {
|
||||
let var1Input = document.createElement("input");
|
||||
var1Input.name = "var1";
|
||||
var1Input.id = "var1Input";
|
||||
var1Input.value = var1Value;
|
||||
var1Input.classList.add("form-control")
|
||||
var1Label.innerHTML = "-";
|
||||
var1Input.placeholder = "From parents";
|
||||
var1Input.disabled = true;
|
||||
var1Div.appendChild(var1Input);
|
||||
|
||||
let var2Input = document.createElement("input");
|
||||
var2Input.name = "var2";
|
||||
var2Input.id = "var2Input";
|
||||
var2Input.value = var2Value;
|
||||
var2Input.classList.add("form-control")
|
||||
var2Input.disabled = true;
|
||||
var2Input.placeholder = ""
|
||||
var2Label.innerHTML = "-";
|
||||
var2Div.appendChild(var2Input);
|
||||
|
||||
let var3Input = document.createElement("input");
|
||||
var3Input.name = "var3";
|
||||
var3Input.id = "var3Input";
|
||||
var3Input.value = var3Value;
|
||||
var3Input.classList.add("form-control");
|
||||
var3Input.disabled = true;
|
||||
var3Input.placeholder = ""
|
||||
var3Label.innerHTML = "-";
|
||||
var3Div.appendChild(var3Input);
|
||||
break;
|
||||
}
|
||||
case "xpath": {
|
||||
let var1Input = document.createElement("input");
|
||||
var1Input.name = "var1";
|
||||
|
|
|
@ -77,6 +77,8 @@ GoWatch Edit {{ .Watch.Name }}
|
|||
<select id="typeInput" class="form-control" name="type">
|
||||
<option value="gurl">Get URL</option>
|
||||
<option value="gurls">Get URLs</option>
|
||||
<option value="bgurl">Browserless Get URL</option>
|
||||
<option value="bgurls">Browserless Get URLs</option>
|
||||
<option value="xpath" selected="true">XPath</option>
|
||||
<option value="css">CSS</option>
|
||||
<option value="json">JSON</option>
|
||||
|
|
2
todo.md
2
todo.md
|
@ -6,7 +6,7 @@
|
|||
- edit.ts
|
||||
- diagram.ts
|
||||
- browserless function filters
|
||||
- split get url http.get and browserless get into 2 filters
|
||||
- ~~split get url http.get and browserless get into 2 filters~~
|
||||
- refactor amazon template
|
||||
- url path support
|
||||
- refactor project structure
|
Loading…
Add table
Reference in a new issue