added innerHTML/attributes/node options to xpath/css filters
This commit is contained in:
parent
306b41a11f
commit
102fce7c17
4 changed files with 172 additions and 42 deletions
70
scraping.go
70
scraping.go
|
@ -338,6 +338,10 @@ func getURLContent(filter *Filter, fetchURL string) (string, error) {
|
||||||
}
|
}
|
||||||
|
|
||||||
func getFilterResultXPath(filter *Filter) {
|
func getFilterResultXPath(filter *Filter) {
|
||||||
|
selectType := "node"
|
||||||
|
if filter.Var2 != nil {
|
||||||
|
selectType = *filter.Var2
|
||||||
|
}
|
||||||
for _, parent := range filter.Parents {
|
for _, parent := range filter.Parents {
|
||||||
for _, result := range parent.Results {
|
for _, result := range parent.Results {
|
||||||
doc, err := htmlquery.Parse(strings.NewReader(result))
|
doc, err := htmlquery.Parse(strings.NewReader(result))
|
||||||
|
@ -347,9 +351,40 @@ func getFilterResultXPath(filter *Filter) {
|
||||||
}
|
}
|
||||||
nodes, _ := htmlquery.QueryAll(doc, filter.Var1)
|
nodes, _ := htmlquery.QueryAll(doc, filter.Var1)
|
||||||
for _, node := range nodes {
|
for _, node := range nodes {
|
||||||
|
switch selectType {
|
||||||
|
case "inner":
|
||||||
|
{
|
||||||
|
// if the child is a text node, there's nothing else (?), so just append that
|
||||||
|
if node.FirstChild != nil && node.FirstChild.Type == html.TextNode {
|
||||||
|
filter.Results = append(filter.Results, html.UnescapeString(node.FirstChild.Data))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// else, there are more nodes, turn them all into a string and add that as a result
|
||||||
|
var result bytes.Buffer
|
||||||
|
for child := node.FirstChild; child != nil; child = child.NextSibling {
|
||||||
|
var b bytes.Buffer
|
||||||
|
html.Render(&b, node)
|
||||||
|
result.WriteString(b.String())
|
||||||
|
}
|
||||||
|
filter.Results = append(filter.Results, html.UnescapeString(result.String()))
|
||||||
|
break
|
||||||
|
}
|
||||||
|
case "attr":
|
||||||
|
{
|
||||||
|
for _, attr := range node.Attr {
|
||||||
|
result := fmt.Sprintf("%s=\"%s\"", attr.Key, attr.Val)
|
||||||
|
filter.Results = append(filter.Results, html.UnescapeString(result))
|
||||||
|
}
|
||||||
|
break
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
{
|
||||||
var b bytes.Buffer
|
var b bytes.Buffer
|
||||||
html.Render(&b, node)
|
html.Render(&b, node)
|
||||||
filter.Results = append(filter.Results, html.UnescapeString(b.String()))
|
filter.Results = append(filter.Results, html.UnescapeString(b.String()))
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -366,6 +401,10 @@ func getFilterResultJSON(filter *Filter) {
|
||||||
}
|
}
|
||||||
|
|
||||||
func getFilterResultCSS(filter *Filter) {
|
func getFilterResultCSS(filter *Filter) {
|
||||||
|
selectType := "node"
|
||||||
|
if filter.Var2 != nil {
|
||||||
|
selectType = *filter.Var2
|
||||||
|
}
|
||||||
for _, parent := range filter.Parents {
|
for _, parent := range filter.Parents {
|
||||||
for _, result := range parent.Results {
|
for _, result := range parent.Results {
|
||||||
doc, err := html.Parse(strings.NewReader(result))
|
doc, err := html.Parse(strings.NewReader(result))
|
||||||
|
@ -379,9 +418,40 @@ func getFilterResultCSS(filter *Filter) {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
for _, node := range cascadia.QueryAll(doc, sel) {
|
for _, node := range cascadia.QueryAll(doc, sel) {
|
||||||
|
switch selectType {
|
||||||
|
case "inner":
|
||||||
|
{
|
||||||
|
// if the child is a text node, there's nothing else (?), so just append that
|
||||||
|
if node.FirstChild != nil && node.FirstChild.Type == html.TextNode {
|
||||||
|
filter.Results = append(filter.Results, html.UnescapeString(node.FirstChild.Data))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// else, there are more nodes, turn them all into a string and add that as a result
|
||||||
|
var result bytes.Buffer
|
||||||
|
for child := node.FirstChild; child != nil; child = child.NextSibling {
|
||||||
|
var b bytes.Buffer
|
||||||
|
html.Render(&b, node)
|
||||||
|
result.WriteString(b.String())
|
||||||
|
}
|
||||||
|
filter.Results = append(filter.Results, html.UnescapeString(result.String()))
|
||||||
|
break
|
||||||
|
}
|
||||||
|
case "attr":
|
||||||
|
{
|
||||||
|
for _, attr := range node.Attr {
|
||||||
|
result := fmt.Sprintf("%s=\"%s\"", attr.Key, attr.Val)
|
||||||
|
filter.Results = append(filter.Results, html.UnescapeString(result))
|
||||||
|
}
|
||||||
|
break
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
{
|
||||||
var b bytes.Buffer
|
var b bytes.Buffer
|
||||||
html.Render(&b, node)
|
html.Render(&b, node)
|
||||||
filter.Results = append(filter.Results, html.UnescapeString(b.String()))
|
filter.Results = append(filter.Results, html.UnescapeString(b.String()))
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -123,15 +123,30 @@ function onTypeChange(node) {
|
||||||
var1Label.innerHTML = "XPath";
|
var1Label.innerHTML = "XPath";
|
||||||
var1Input.placeholder = "//a[@class='price]";
|
var1Input.placeholder = "//a[@class='price]";
|
||||||
var1Div.appendChild(var1Input);
|
var1Div.appendChild(var1Input);
|
||||||
var var2Input = document.createElement("input");
|
var select_1 = document.createElement("select");
|
||||||
var2Input.name = "var2";
|
select_1.name = "var2";
|
||||||
var2Input.id = "var2Input";
|
select_1.id = "var2Input";
|
||||||
var2Input.value = var2Value;
|
select_1.classList.add("form-control");
|
||||||
var2Input.classList.add("form-control");
|
var innerHTML = document.createElement("option");
|
||||||
var2Input.disabled = true;
|
innerHTML.value = "inner";
|
||||||
var2Input.placeholder = "";
|
innerHTML.innerHTML = "innerHTML";
|
||||||
var2Label.innerHTML = "-";
|
select_1.appendChild(innerHTML);
|
||||||
var2Div.appendChild(var2Input);
|
var attributes = document.createElement("option");
|
||||||
|
attributes.value = "attr";
|
||||||
|
attributes.innerHTML = "Attributes";
|
||||||
|
select_1.appendChild(attributes);
|
||||||
|
var node_1 = document.createElement("option");
|
||||||
|
node_1.value = "node";
|
||||||
|
node_1.innerHTML = "Node";
|
||||||
|
select_1.appendChild(node_1);
|
||||||
|
var2Div.appendChild(select_1);
|
||||||
|
var2Label.innerHTML = "Select";
|
||||||
|
if (var2Value == "") {
|
||||||
|
select_1.value = "inner";
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
select_1.value = var2Value;
|
||||||
|
}
|
||||||
var var3Input = document.createElement("input");
|
var var3Input = document.createElement("input");
|
||||||
var3Input.name = "var3";
|
var3Input.name = "var3";
|
||||||
var3Input.id = "var3Input";
|
var3Input.id = "var3Input";
|
||||||
|
@ -179,14 +194,30 @@ function onTypeChange(node) {
|
||||||
var1Label.innerHTML = "Selector";
|
var1Label.innerHTML = "Selector";
|
||||||
var1Input.placeholder = ".price";
|
var1Input.placeholder = ".price";
|
||||||
var1Div.appendChild(var1Input);
|
var1Div.appendChild(var1Input);
|
||||||
var var2Input = document.createElement("input");
|
var select_2 = document.createElement("select");
|
||||||
var2Input.name = "var2";
|
select_2.name = "var2";
|
||||||
var2Input.id = "var2Input";
|
select_2.id = "var2Input";
|
||||||
var2Input.value = var2Value;
|
select_2.classList.add("form-control");
|
||||||
var2Input.classList.add("form-control");
|
var innerHTML = document.createElement("option");
|
||||||
var2Input.disabled = true;
|
innerHTML.value = "inner";
|
||||||
var2Label.innerHTML = "-";
|
innerHTML.innerHTML = "innerHTML";
|
||||||
var2Div.appendChild(var2Input);
|
select_2.appendChild(innerHTML);
|
||||||
|
var attributes = document.createElement("option");
|
||||||
|
attributes.value = "attr";
|
||||||
|
attributes.innerHTML = "Attributes";
|
||||||
|
select_2.appendChild(attributes);
|
||||||
|
var node_2 = document.createElement("option");
|
||||||
|
node_2.value = "node";
|
||||||
|
node_2.innerHTML = "Node";
|
||||||
|
select_2.appendChild(node_2);
|
||||||
|
var2Div.appendChild(select_2);
|
||||||
|
var2Label.innerHTML = "Select";
|
||||||
|
if (var2Value == "") {
|
||||||
|
select_2.value = "inner";
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
select_2.value = var2Value;
|
||||||
|
}
|
||||||
var var3Input = document.createElement("input");
|
var var3Input = document.createElement("input");
|
||||||
var3Input.name = "var3";
|
var3Input.name = "var3";
|
||||||
var3Input.id = "var3Input";
|
var3Input.id = "var3Input";
|
||||||
|
@ -748,13 +779,13 @@ function onConditionChange(node) {
|
||||||
filterSelect.classList.add("form-control");
|
filterSelect.classList.add("form-control");
|
||||||
try {
|
try {
|
||||||
for (var _b = __values(_diagram.nodes.values()), _c = _b.next(); !_c.done; _c = _b.next()) {
|
for (var _b = __values(_diagram.nodes.values()), _c = _b.next(); !_c.done; _c = _b.next()) {
|
||||||
var node_1 = _c.value;
|
var node_3 = _c.value;
|
||||||
if (node_1.type != "store") {
|
if (node_3.type != "store") {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
var nodeOption = document.createElement("option");
|
var nodeOption = document.createElement("option");
|
||||||
nodeOption.value = node_1.label;
|
nodeOption.value = node_3.label;
|
||||||
nodeOption.innerHTML = node_1.label;
|
nodeOption.innerHTML = node_3.label;
|
||||||
filterSelect.appendChild(nodeOption);
|
filterSelect.appendChild(nodeOption);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -100,15 +100,29 @@ function onTypeChange(node: DiagramNode | null = null){
|
||||||
var1Input.placeholder = "//a[@class='price]";
|
var1Input.placeholder = "//a[@class='price]";
|
||||||
var1Div.appendChild(var1Input);
|
var1Div.appendChild(var1Input);
|
||||||
|
|
||||||
let var2Input = document.createElement("input");
|
let select = document.createElement("select");
|
||||||
var2Input.name = "var2";
|
select.name = "var2";
|
||||||
var2Input.id = "var2Input";
|
select.id = "var2Input";
|
||||||
var2Input.value = var2Value;
|
select.classList.add("form-control");
|
||||||
var2Input.classList.add("form-control")
|
let innerHTML = document.createElement("option");
|
||||||
var2Input.disabled = true;
|
innerHTML.value = "inner"
|
||||||
var2Input.placeholder = "";
|
innerHTML.innerHTML = "innerHTML";
|
||||||
var2Label.innerHTML = "-";
|
select.appendChild(innerHTML);
|
||||||
var2Div.appendChild(var2Input);
|
let attributes = document.createElement("option");
|
||||||
|
attributes.value = "attr"
|
||||||
|
attributes.innerHTML = "Attributes";
|
||||||
|
select.appendChild(attributes);
|
||||||
|
let node = document.createElement("option");
|
||||||
|
node.value = "node"
|
||||||
|
node.innerHTML = "Node";
|
||||||
|
select.appendChild(node);
|
||||||
|
var2Div.appendChild(select);
|
||||||
|
var2Label.innerHTML = "Select";
|
||||||
|
if (var2Value == ""){
|
||||||
|
select.value = "inner";
|
||||||
|
} else {
|
||||||
|
select.value = var2Value;
|
||||||
|
}
|
||||||
|
|
||||||
let var3Input = document.createElement("input");
|
let var3Input = document.createElement("input");
|
||||||
var3Input.name = "var3";
|
var3Input.name = "var3";
|
||||||
|
@ -160,14 +174,29 @@ function onTypeChange(node: DiagramNode | null = null){
|
||||||
var1Input.placeholder = ".price";
|
var1Input.placeholder = ".price";
|
||||||
var1Div.appendChild(var1Input);
|
var1Div.appendChild(var1Input);
|
||||||
|
|
||||||
let var2Input = document.createElement("input");
|
let select = document.createElement("select");
|
||||||
var2Input.name = "var2";
|
select.name = "var2";
|
||||||
var2Input.id = "var2Input";
|
select.id = "var2Input";
|
||||||
var2Input.value = var2Value;
|
select.classList.add("form-control");
|
||||||
var2Input.classList.add("form-control")
|
let innerHTML = document.createElement("option");
|
||||||
var2Input.disabled = true;
|
innerHTML.value = "inner"
|
||||||
var2Label.innerHTML = "-";
|
innerHTML.innerHTML = "innerHTML";
|
||||||
var2Div.appendChild(var2Input);
|
select.appendChild(innerHTML);
|
||||||
|
let attributes = document.createElement("option");
|
||||||
|
attributes.value = "attr"
|
||||||
|
attributes.innerHTML = "Attributes";
|
||||||
|
select.appendChild(attributes);
|
||||||
|
let node = document.createElement("option");
|
||||||
|
node.value = "node"
|
||||||
|
node.innerHTML = "Node";
|
||||||
|
select.appendChild(node);
|
||||||
|
var2Div.appendChild(select);
|
||||||
|
var2Label.innerHTML = "Select";
|
||||||
|
if (var2Value == ""){
|
||||||
|
select.value = "inner";
|
||||||
|
} else {
|
||||||
|
select.value = var2Value;
|
||||||
|
}
|
||||||
|
|
||||||
let var3Input = document.createElement("input");
|
let var3Input = document.createElement("input");
|
||||||
var3Input.name = "var3";
|
var3Input.name = "var3";
|
||||||
|
|
8
todo.md
8
todo.md
|
@ -9,8 +9,8 @@
|
||||||
- json
|
- json
|
||||||
- add index to docs/compose to fix link in pages
|
- add index to docs/compose to fix link in pages
|
||||||
- safe escape {{ }} for pages
|
- safe escape {{ }} for pages
|
||||||
- xpath/css innerHTML option?
|
- ~~xpath/css innerHTML option?~~
|
||||||
- inner
|
- ~~inner~~
|
||||||
- attributes
|
- ~~attributes~~
|
||||||
- node
|
- ~~node~~
|
||||||
- tests for all of it
|
- tests for all of it
|
Loading…
Add table
Reference in a new issue