package extractors

import (
	"encoding/json"
	"strings"

	"github.com/antchfx/htmlquery"
	"github.com/projectdiscovery/nuclei/v2/pkg/types"
)

// ExtractRegex extracts text from a corpus and returns it
func (e *Extractor) ExtractRegex(corpus string) map[string]struct{} {
2020-04-27 23:34:08 +05:30
results := make(map[string]struct{})
groupPlusOne := e.RegexGroup + 1
for _, regex := range e.regexCompiled {
matches := regex.FindAllStringSubmatch(corpus, -1)
2020-12-24 12:13:18 +05:30
2020-04-27 23:34:08 +05:30
for _, match := range matches {
2020-12-24 12:13:18 +05:30
if len(match) < groupPlusOne {
continue
}
2020-12-24 12:13:18 +05:30
matchString := match[e.RegexGroup]
2020-12-24 12:13:18 +05:30
if _, ok := results[matchString]; !ok {
results[matchString] = struct{}{}
}
2020-07-16 10:32:00 +02:00
}
}
return results
}
2020-12-24 20:47:41 +05:30
// ExtractKval extracts key value pairs from a data map
func (e *Extractor) ExtractKval(data map[string]interface{}) map[string]struct{} {
2020-07-16 10:32:00 +02:00
results := make(map[string]struct{})
2020-07-16 12:58:56 +02:00
for _, k := range e.KVal {
2020-12-24 12:13:18 +05:30
item, ok := data[k]
if !ok {
continue
}
itemString := types.ToString(item)
if _, ok := results[itemString]; !ok {
results[itemString] = struct{}{}
2020-07-16 10:32:00 +02:00
}
}
return results
}
2021-08-02 21:43:50 +05:30
// ExtractHTML extracts items from text using XPath selectors
func (e *Extractor) ExtractHTML(corpus string) map[string]struct{} {
results := make(map[string]struct{})
doc, err := htmlquery.Parse(strings.NewReader(corpus))
if err != nil {
return results
}
for _, k := range e.XPath {
nodes, err := htmlquery.QueryAll(doc, k)
if err != nil {
continue
}
for _, node := range nodes {
var value string
if e.Attribute != "" {
value = htmlquery.SelectAttr(node, e.Attribute)
} else {
value = htmlquery.InnerText(node)
}
if _, ok := results[value]; !ok {
results[value] = struct{}{}
}
}
}
return results
}
// ExtractJSON extracts text from a corpus using JQ queries and returns it
2021-08-01 14:42:04 +02:00
func (e *Extractor) ExtractJSON(corpus string) map[string]struct{} {
2021-07-31 22:49:23 +02:00
results := make(map[string]struct{})
var jsonObj interface{}
if err := json.Unmarshal([]byte(corpus), &jsonObj); err != nil {
2021-07-31 22:49:23 +02:00
return results
}
for _, k := range e.jsonCompiled {
iter := k.Run(jsonObj)
for {
v, ok := iter.Next()
if !ok {
break
}
if _, ok := v.(error); ok {
break
}
var result string
if res, err := types.JSONScalarToString(v); err == nil {
result = res
} else if res, err := json.Marshal(v); err == nil {
result = string(res)
} else {
result = types.ToString(v)
}
if _, ok := results[result]; !ok {
results[result] = struct{}{}
2021-07-31 22:49:23 +02:00
}
}
}
return results
}