From 6057506f7b0264eb65be455d76c078eeb48f82ee Mon Sep 17 00:00:00 2001 From: BroodjeAap Date: Tue, 31 Jan 2023 19:59:15 +0000 Subject: [PATCH] split get url(s) into two functions one 'normal' and one browserless get --- scraping.go | 147 ++++++++++++++++++++++++++------------ static/edit.js | 59 +++++++++++++++ static/edit.ts | 63 ++++++++++++++++ templates/watch/edit.html | 2 + todo.md | 2 +- 5 files changed, 225 insertions(+), 48 deletions(-) diff --git a/scraping.go b/scraping.go index cd36e48..d9ed4c8 100644 --- a/scraping.go +++ b/scraping.go @@ -3,6 +3,7 @@ package main import ( "bytes" "encoding/json" + "errors" "fmt" "html/template" "io/ioutil" @@ -126,6 +127,14 @@ func getFilterResult(filters []Filter, filter *Filter, watch *Watch, web *Web, d { getFilterResultURLs(filter, web.urlCache, debug) } + case filter.Type == "bgurl": + { + getFilterResultBrowserlessURL(filter, web.urlCache, debug) + } + case filter.Type == "bgurls": + { + getFilterResultBrowserlessURLs(filter, web.urlCache, debug) + } case filter.Type == "xpath": { getFilterResultXPath(filter) @@ -250,6 +259,8 @@ func getFilterResultURL(filter *Filter, urlCache map[string]string, debug bool) } str, err := getURLContent(filter, fetchURL) if err != nil { + log.Println("Could not fetch url: ", fetchURL, " - ", err) + filter.log("Could not fetch url: ", fetchURL, " - ", err) return } filter.Results = append(filter.Results, str) @@ -270,6 +281,8 @@ func getFilterResultURLs(filter *Filter, urlCache map[string]string, debug bool) str, err := getURLContent(filter, fetchURL) if err != nil { + log.Println("Could not fetch url: ", fetchURL, " - ", err) + filter.log("Could not fetch url: ", fetchURL, " - ", err) continue } filter.Results = append(filter.Results, str) @@ -281,62 +294,102 @@ func getFilterResultURLs(filter *Filter, urlCache map[string]string, debug bool) } func getURLContent(filter *Filter, fetchURL string) (string, error) { - var body []byte - if viper.IsSet("browserless.url") { - browserlessURL := viper.GetString("browserless.url") - data := struct { - URL string `json:"url"` - }{ - URL: fetchURL, - } - jsn, err := json.Marshal(data) + var httpClient *http.Client + if viper.IsSet("proxy.proxy_url") { + proxyUrl, err := url.Parse(viper.GetString("proxy.proxy_url")) if err != nil { - log.Println("Could not marshal url:", err) - filter.log("Could not marshal url:", err) - return "", err - } - resp, err := http.Post(browserlessURL, "application/json", bytes.NewBuffer(jsn)) - if err != nil { - log.Println("Could not get browserless response content:", err) - filter.log("Could not get browserless response content:", err) - return "", err - } - body, err = ioutil.ReadAll(resp.Body) - if err != nil { - log.Println("Could not fetch url through browserless: ", fetchURL, " - ", err) - filter.log("Could not fetch url through browserless: ", fetchURL, " - ", err) return "", err } + httpClient = &http.Client{Transport: &http.Transport{Proxy: http.ProxyURL(proxyUrl)}} } else { - var httpClient *http.Client - if viper.IsSet("proxy.proxy_url") { - proxyUrl, err := url.Parse(viper.GetString("proxy.proxy_url")) - if err != nil { - log.Println("Could not parse proxy url, check config") - filter.log("Could not parse proxy url, check config") - return "", err - } - httpClient = &http.Client{Transport: &http.Transport{Proxy: http.ProxyURL(proxyUrl)}} - } else { - httpClient = &http.Client{} - } - resp, err := httpClient.Get(fetchURL) - if err != nil { - log.Println("Could not fetch url: ", fetchURL, " - ", err) - filter.log("Could not fetch url: ", fetchURL, " - ", err) - return "", err - } - body, err = ioutil.ReadAll(resp.Body) - if err != nil { - log.Println("Could not fetch url: ", fetchURL, " - ", err) - filter.log("Could not fetch url: ", fetchURL, " - ", err) - return "", err - } + httpClient = &http.Client{} + } + resp, err := httpClient.Get(fetchURL) + if err != nil { + return "", err + } + body, err := ioutil.ReadAll(resp.Body) + if err != nil { + return "", err } return string(body), nil } +func getFilterResultBrowserlessURL(filter *Filter, urlCache map[string]string, debug bool) { + fetchURL := filter.Var1 + val, exists := urlCache["b"+fetchURL] + if debug && exists { + filter.Results = append(filter.Results, val) + return + } + str, err := getBrowserlessURLContent(filter, fetchURL) + if err != nil { + log.Println("Could not fetch url: ", fetchURL, " - ", err) + filter.log("Could not fetch url: ", fetchURL, " - ", err) + return + } + filter.Results = append(filter.Results, str) + if debug { + urlCache["b"+fetchURL] = str + } +} + +func getFilterResultBrowserlessURLs(filter *Filter, urlCache map[string]string, debug bool) { + for _, parent := range filter.Parents { + for _, result := range parent.Results { + fetchURL := result + val, exists := urlCache["b"+fetchURL] + if debug && exists { + filter.Results = append(filter.Results, val) + continue + } + + str, err := getBrowserlessURLContent(filter, fetchURL) + if err != nil { + log.Println("Could not fetch url: ", fetchURL, " - ", err) + filter.log("Could not fetch url: ", fetchURL, " - ", err) + continue + } + filter.Results = append(filter.Results, str) + if debug { + urlCache["b"+fetchURL] = str + } + } + } +} + +func getBrowserlessURLContent(filter *Filter, fetchURL string) (string, error) { + if !viper.IsSet("browserless.url") { + return "", errors.New("browserless.url not set") + } + browserlessURL := viper.GetString("browserless.url") + data := struct { + URL string `json:"url"` + }{ + URL: fetchURL, + } + jsn, err := json.Marshal(data) + if err != nil { + log.Println("Could not marshal url:", err) + filter.log("Could not marshal url:", err) + return "", err + } + resp, err := http.Post(browserlessURL, "application/json", bytes.NewBuffer(jsn)) + if err != nil { + log.Println("Could not get browserless response content:", err) + filter.log("Could not get browserless response content:", err) + return "", err + } + body, err := ioutil.ReadAll(resp.Body) + if err != nil { + log.Println("Could not fetch url through browserless: ", fetchURL, " - ", err) + filter.log("Could not fetch url through browserless: ", fetchURL, " - ", err) + return "", err + } + return string(body), nil +} + func getFilterResultXPath(filter *Filter) { selectType := "node" if filter.Var2 != nil { diff --git a/static/edit.js b/static/edit.js index 79d6cf6..a38bd54 100644 --- a/static/edit.js +++ b/static/edit.js @@ -114,6 +114,65 @@ function onTypeChange(node) { var3Div.appendChild(var3Input); break; } + case "bgurl": { + var var1Input = document.createElement("input"); + var1Input.name = "var1"; + var1Input.id = "var1Input"; + var1Input.value = var1Value; + var1Input.classList.add("form-control"); + var1Label.innerHTML = "URL"; + var1Input.placeholder = "https://shopping.website.com"; + var1Div.appendChild(var1Input); + var var2Input = document.createElement("input"); + var2Input.name = "var2"; + var2Input.id = "var2Input"; + var2Input.value = var2Value; + var2Input.classList.add("form-control"); + var2Input.disabled = true; + var2Input.placeholder = ""; + var2Label.innerHTML = "-"; + var2Div.appendChild(var2Input); + var var3Input = document.createElement("input"); + var3Input.name = "var3"; + var3Input.id = "var3Input"; + var3Input.value = var3Value; + var3Input.classList.add("form-control"); + var3Input.disabled = true; + var3Input.placeholder = ""; + var3Label.innerHTML = "-"; + var3Div.appendChild(var3Input); + break; + } + case "bgurls": { + var var1Input = document.createElement("input"); + var1Input.name = "var1"; + var1Input.id = "var1Input"; + var1Input.value = var1Value; + var1Input.classList.add("form-control"); + var1Label.innerHTML = "-"; + var1Input.placeholder = "From parents"; + var1Input.disabled = true; + var1Div.appendChild(var1Input); + var var2Input = document.createElement("input"); + var2Input.name = "var2"; + var2Input.id = "var2Input"; + var2Input.value = var2Value; + var2Input.classList.add("form-control"); + var2Input.disabled = true; + var2Input.placeholder = ""; + var2Label.innerHTML = "-"; + var2Div.appendChild(var2Input); + var var3Input = document.createElement("input"); + var3Input.name = "var3"; + var3Input.id = "var3Input"; + var3Input.value = var3Value; + var3Input.classList.add("form-control"); + var3Input.disabled = true; + var3Input.placeholder = ""; + var3Label.innerHTML = "-"; + var3Div.appendChild(var3Input); + break; + } case "xpath": { var var1Input = document.createElement("input"); var1Input.name = "var1"; diff --git a/static/edit.ts b/static/edit.ts index eca51dd..3e5a650 100644 --- a/static/edit.ts +++ b/static/edit.ts @@ -90,6 +90,69 @@ function onTypeChange(node: DiagramNode | null = null){ var3Div.appendChild(var3Input); break; } + case "bgurl": { + let var1Input = document.createElement("input"); + var1Input.name = "var1"; + var1Input.id = "var1Input"; + var1Input.value = var1Value; + var1Input.classList.add("form-control"); + var1Label.innerHTML = "URL"; + var1Input.placeholder = "https://shopping.website.com"; + var1Div.appendChild(var1Input); + + let var2Input = document.createElement("input"); + var2Input.name = "var2"; + var2Input.id = "var2Input"; + var2Input.value = var2Value; + var2Input.classList.add("form-control"); + var2Input.disabled = true; + var2Input.placeholder = "" + var2Label.innerHTML = "-"; + var2Div.appendChild(var2Input); + + let var3Input = document.createElement("input"); + var3Input.name = "var3"; + var3Input.id = "var3Input"; + var3Input.value = var3Value; + var3Input.classList.add("form-control"); + var3Input.disabled = true; + var3Input.placeholder = "" + var3Label.innerHTML = "-"; + var3Div.appendChild(var3Input); + break; + } + case "bgurls": { + let var1Input = document.createElement("input"); + var1Input.name = "var1"; + var1Input.id = "var1Input"; + var1Input.value = var1Value; + var1Input.classList.add("form-control") + var1Label.innerHTML = "-"; + var1Input.placeholder = "From parents"; + var1Input.disabled = true; + var1Div.appendChild(var1Input); + + let var2Input = document.createElement("input"); + var2Input.name = "var2"; + var2Input.id = "var2Input"; + var2Input.value = var2Value; + var2Input.classList.add("form-control") + var2Input.disabled = true; + var2Input.placeholder = "" + var2Label.innerHTML = "-"; + var2Div.appendChild(var2Input); + + let var3Input = document.createElement("input"); + var3Input.name = "var3"; + var3Input.id = "var3Input"; + var3Input.value = var3Value; + var3Input.classList.add("form-control"); + var3Input.disabled = true; + var3Input.placeholder = "" + var3Label.innerHTML = "-"; + var3Div.appendChild(var3Input); + break; + } case "xpath": { let var1Input = document.createElement("input"); var1Input.name = "var1"; diff --git a/templates/watch/edit.html b/templates/watch/edit.html index 2c46612..9eb8582 100644 --- a/templates/watch/edit.html +++ b/templates/watch/edit.html @@ -77,6 +77,8 @@ GoWatch Edit {{ .Watch.Name }}