package extractors

import (
	"strings"

	"github.com/antchfx/htmlquery"
	"github.com/projectdiscovery/nuclei/v2/pkg/types"
)

// ExtractRegex runs every compiled regular expression of the extractor
// against corpus and returns the set of strings captured by the configured
// group (e.RegexGroup) across all matches.
//
// Matches that do not contain enough submatches to index the configured
// group are skipped. The returned map is used as a set: keys are the
// extracted strings and values carry no information.
func (e *Extractor) ExtractRegex(corpus string) map[string]struct{} {
	found := make(map[string]struct{})
	required := e.RegexGroup + 1

	for _, re := range e.regexCompiled {
		for _, submatches := range re.FindAllStringSubmatch(corpus, -1) {
			// Skip matches too short to contain the requested group.
			if len(submatches) < required {
				continue
			}
			// Inserting into a set is idempotent, so duplicates collapse.
			found[submatches[e.RegexGroup]] = struct{}{}
		}
	}
	return found
}

// ExtractKval looks up each key listed in e.KVal in data and returns the
// set of values found, converted to strings with types.ToString.
//
// Keys that are absent from data are ignored. The returned map is used as a
// set: keys are the stringified values and values carry no information.
func (e *Extractor) ExtractKval(data map[string]interface{}) map[string]struct{} {
	found := make(map[string]struct{})

	for _, key := range e.KVal {
		if raw, present := data[key]; present {
			found[types.ToString(raw)] = struct{}{}
		}
	}
	return found
}

// ExtractHTML parses corpus as HTML, evaluates every XPath expression in
// e.XPath against the parsed document, and returns the set of values taken
// from the selected nodes.
//
// When e.Attribute is non-empty the value of that attribute is taken from
// each node; otherwise the node's inner text is used. If corpus cannot be
// parsed an empty set is returned, and an XPath expression that fails to
// evaluate is skipped without affecting the remaining expressions.
func (e *Extractor) ExtractHTML(corpus string) map[string]struct{} {
	found := make(map[string]struct{})

	root, parseErr := htmlquery.Parse(strings.NewReader(corpus))
	if parseErr != nil {
		return found
	}

	for _, expr := range e.XPath {
		selected, queryErr := htmlquery.QueryAll(root, expr)
		if queryErr != nil {
			continue
		}
		for _, n := range selected {
			// No attribute configured: fall back to the node's inner text.
			if e.Attribute == "" {
				found[htmlquery.InnerText(n)] = struct{}{}
				continue
			}
			found[htmlquery.SelectAttr(n, e.Attribute)] = struct{}{}
		}
	}
	return found
}