diff --git a/cmd/nuclei/main.go b/cmd/nuclei/main.go
index de29d6581..d536802b4 100644
--- a/cmd/nuclei/main.go
+++ b/cmd/nuclei/main.go
@@ -328,6 +328,7 @@ on extensive configurability, massive extensibility and ease of use.`)
 		flagSet.BoolVar(&fuzzFlag, "fuzz", false, "enable loading fuzzing templates (Deprecated: use -dast instead)"),
 		flagSet.BoolVar(&options.DAST, "dast", false, "enable / run dast (fuzz) nuclei templates"),
 		flagSet.BoolVarP(&options.DisplayFuzzPoints, "display-fuzz-points", "dfp", false, "display fuzz points in the output for debugging"),
+		flagSet.BoolVarP(&options.FuzzingDedupe, "fuzzing-dedupe", "fd", false, "deduplicate fuzzing url inputs"),
 		flagSet.IntVar(&options.FuzzParamFrequency, "fuzz-param-frequency", 10, "frequency of uninteresting parameters for fuzzing before skipping"),
 		flagSet.StringVarP(&options.FuzzAggressionLevel, "fuzz-aggression", "fa", "low", "fuzzing aggression level controls payload count for fuzz (low, medium, high)"),
 	)
diff --git a/internal/runner/options.go b/internal/runner/options.go
index 2872b96a7..794cb3546 100644
--- a/internal/runner/options.go
+++ b/internal/runner/options.go
@@ -152,6 +152,9 @@ func ValidateOptions(options *types.Options) error {
 	if options.Verbose && options.Silent {
 		return errors.New("both verbose and silent mode specified")
 	}
+	if options.FuzzingDedupe && options.Stream {
+		return errors.New("both fuzzing dedupe and stream mode specified")
+	}
 	if (options.HeadlessOptionalArguments != nil || options.ShowBrowser || options.UseInstalledChrome) && !options.Headless {
 		return errors.New("headless mode (-headless) is required if -ho, -sb, -sc or -lha are set")
diff --git a/pkg/input/provider/dedupe/dedupe.go b/pkg/input/provider/dedupe/dedupe.go
new file mode 100644
index 000000000..1e9e96659
--- /dev/null
+++ b/pkg/input/provider/dedupe/dedupe.go
@@ -0,0 +1,102 @@
+// Package dedupe implements URL deduplication for Nuclei DAST and
+// fuzzing inputs.
+//
+// It is used to remove similar or non-relevant inputs from fuzzing
+// or DAST scans to reduce the number of requests made.
+package dedupe
+
+import (
+	"fmt"
+	"net/url"
+	"regexp"
+	"slices"
+	"strings"
+
+	mapsutil "github.com/projectdiscovery/utils/maps"
+)
+
+// FuzzingDeduper is a deduper for fuzzing inputs.
+//
+// The normalization works as follows:
+//
+// - Numeric IDs in path segments are replaced with {numeric_id}
+// - Query parameter values are templated with their key names
+//   (?id=1 becomes ?id={id}) and the parameters are sorted by name.
+//   TODO: All values are stripped; make this more flexible if
+//   meaningfully different values should not be collapsed.
+//
+// Trailing slashes are preserved, so /page and /page/ remain distinct
+// inputs. This allows us to deduplicate URLs that differ only in query
+// parameter values or order while sharing the same structure and key names.
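+//
+// For example, both of the following (illustrative) inputs normalize
+// to http://example.com/posts/{numeric_id}?page={page}&sort={sort},
+// so only the first one is kept:
+//
+//	http://example.com/posts/123?page=2&sort=asc
+//	http://example.com/posts/456?sort=desc&page=9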
+type FuzzingDeduper struct {
+	items *mapsutil.SyncLockMap[string, struct{}]
+}
+
+// NewFuzzingDeduper creates a new fuzzing deduper
+func NewFuzzingDeduper() *FuzzingDeduper {
+	return &FuzzingDeduper{
+		items: mapsutil.NewSyncLockMap[string, struct{}](),
+	}
+}
+
+// Add adds a new URL to the deduper. It returns true if the URL's
+// normalized pattern has not been seen before, and false if the URL
+// is a duplicate or cannot be parsed.
+func (d *FuzzingDeduper) Add(URL string) bool {
+	generatedPattern, err := generatePattern(URL)
+	if err != nil {
+		return false
+	}
+
+	_, found := d.items.Get(generatedPattern)
+	if found {
+		return false
+	}
+	d.items.Set(generatedPattern, struct{}{})
+	return true
+}
+
+// generatePattern builds the normalized scheme://host/path?query pattern
+// that serves as the deduplication key for a URL.
+func generatePattern(urlStr string) (string, error) {
+	parsedURL, err := url.ParseRequestURI(urlStr)
+	if err != nil {
+		return "", err
+	}
+
+	path := normalizePath(parsedURL.Path)
+	query := extractQuery(parsedURL.Query())
+
+	var builder strings.Builder
+	builder.Grow(len(urlStr))
+	builder.WriteString(parsedURL.Scheme)
+	builder.WriteString("://")
+	builder.WriteString(parsedURL.Host)
+	builder.WriteString(path)
+	if query != "" {
+		builder.WriteString("?")
+		builder.WriteString(query)
+	}
+	pattern := builder.String()
+	return pattern, nil
+}
+
+var (
+	numericIDPathRegex = regexp.MustCompile(`/(\d+)(?:/|$)`)
+)
+
+// normalizePath replaces purely numeric path segments with {numeric_id},
+// keeping the segment separator intact when the match is not at the end
+// of the path.
+func normalizePath(path string) string {
+	subMatches := numericIDPathRegex.FindAllStringSubmatch(path, -1)
+	for _, match := range subMatches {
+		replacement := "/{numeric_id}"
+		if strings.HasSuffix(match[0], "/") {
+			replacement = "/{numeric_id}/"
+		}
+		path = strings.ReplaceAll(path, match[0], replacement)
+	}
+	return path
+}
+
+// extractQuery templates each query parameter value with its key name and
+// sorts the result, so that differing values or orders collapse into the
+// same pattern.
+func extractQuery(query url.Values) string {
+	normalizedParams := make([]string, 0, len(query))
+
+	for k, v := range query {
+		if len(v) == 0 {
+			normalizedParams = append(normalizedParams, k)
+		} else {
+			normalizedParams = append(normalizedParams, fmt.Sprintf("%s={%s}", k, k))
+		}
+	}
+	slices.Sort(normalizedParams)
+	return strings.Join(normalizedParams, "&")
+}
diff --git a/pkg/input/provider/dedupe/dedupe_test.go b/pkg/input/provider/dedupe/dedupe_test.go
new file mode 100644
index 000000000..38af74686
--- /dev/null
+++ b/pkg/input/provider/dedupe/dedupe_test.go
@@ -0,0 +1,137 @@
+package dedupe
+
+import (
+	"fmt"
+	"testing"
+
+	"github.com/stretchr/testify/require"
+)
+
+func TestFuzzingDeduper(t *testing.T) {
+	t.Run("Basic URL Deduplication", func(t *testing.T) {
+		tests := []struct {
+			name     string
+			urls     []string
+			expected []bool
+		}{
+			{
+				name:     "Simple unique URLs",
+				urls:     []string{"http://example.com/page1", "http://example.com/page2"},
+				expected: []bool{true, true},
+			},
+			{
+				name:     "Duplicate URLs",
+				urls:     []string{"http://example.com/page1", "http://example.com/page1"},
+				expected: []bool{true, false},
+			},
+			{
+				name:     "URLs with different query param values",
+				urls:     []string{"http://example.com/page?id=1", "http://example.com/page?id=2"},
+				expected: []bool{true, false},
+			},
+			{
+				name:     "URLs with different query param orders",
+				urls:     []string{"http://example.com/page?a=1&b=2", "http://example.com/page?b=2&a=1"},
+				expected: []bool{true, false},
+			},
+			{
+				name:     "URLs with and without trailing slash",
+				urls:     []string{"http://example.com/page/", "http://example.com/page"},
+				expected: []bool{true, true},
+			},
+			{
+				name:     "URLs with different schemes",
+				urls:     []string{"http://example.com", "https://example.com"},
+				expected: []bool{true, true},
+			},
+			{
+				name:     "URLs with query params and without",
+				urls:     []string{"http://example.com/page", "http://example.com/page?param=value"},
+				expected: []bool{true, true},
+			},
+			{
+				name:     "Invalid URLs",
+				urls:     []string{"http://example.com/page", "not a valid url"},
+				expected: []bool{true, false},
+			},
+			{
name: "URLs with empty query params", + urls: []string{"http://example.com/page?param1=¶m2=", "http://example.com/page?param2=¶m1="}, + expected: []bool{true, false}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + deduper := NewFuzzingDeduper() + for i, url := range tt.urls { + result := deduper.Add(url) + require.Equal(t, tt.expected[i], result, "Add(%q) = %v, want %v", url, result, tt.expected[i]) + } + }) + } + }) + + t.Run("Large Set Deduplication", func(t *testing.T) { + deduper := NewFuzzingDeduper() + baseURL := "http://example.com/page?id=%d¶m=%s" + + for i := 0; i < 1000; i++ { + url := fmt.Sprintf(baseURL, i, "value") + result := deduper.Add(url) + if i == 0 { + require.True(t, result, "First URL should be added") + } else { + require.False(t, result, "Duplicate URL pattern should not be added: %s", url) + } + } + + allItems := deduper.items.GetAll() + require.Len(t, allItems, 1, "Expected 1 unique URL pattern, got %d", len(allItems)) + }) + + t.Run("Path Parameters", func(t *testing.T) { + deduper := NewFuzzingDeduper() + + require.True(t, deduper.Add("https://example.com/page/1337")) + require.False(t, deduper.Add("https://example.com/page/1332")) + }) + + t.Run("TestPHP Vulnweb URLs", func(t *testing.T) { + urls := []string{ + "http://testphp.vulnweb.com/hpp/?pp=12", + "http://testphp.vulnweb.com/hpp/params.php?p=valid&pp=12", + "http://testphp.vulnweb.com/artists.php?artist=3", + "http://testphp.vulnweb.com/artists.php?artist=1", + "http://testphp.vulnweb.com/artists.php?artist=2", + "http://testphp.vulnweb.com/listproducts.php?artist=3", + "http://testphp.vulnweb.com/listproducts.php?cat=4", + "http://testphp.vulnweb.com/listproducts.php?cat=3", + "http://testphp.vulnweb.com/listproducts.php?cat=2", + "http://testphp.vulnweb.com/listproducts.php?artist=2", + "http://testphp.vulnweb.com/listproducts.php?artist=1", + "http://testphp.vulnweb.com/listproducts.php?cat=1", + "http://testphp.vulnweb.com/showimage.php?file=./pictures/6.jpg", + "http://testphp.vulnweb.com/product.php?pic=6", + "http://testphp.vulnweb.com/showimage.php?file=./pictures/6.jpg&size=160", + } + + expectedUnique := 8 + + deduper := NewFuzzingDeduper() + uniqueCount := 0 + + for _, url := range urls { + if deduper.Add(url) { + uniqueCount++ + } + } + + require.Equal(t, expectedUnique, uniqueCount, "Expected %d unique URLs, but got %d", expectedUnique, uniqueCount) + + // Test for duplicates + for _, url := range urls { + require.False(t, deduper.Add(url), "URL should have been identified as duplicate: %s", url) + } + }) +} diff --git a/pkg/input/provider/list/hmap.go b/pkg/input/provider/list/hmap.go index edf372919..3d832bb4d 100644 --- a/pkg/input/provider/list/hmap.go +++ b/pkg/input/provider/list/hmap.go @@ -19,6 +19,7 @@ import ( "github.com/projectdiscovery/hmap/filekv" "github.com/projectdiscovery/hmap/store/hybrid" "github.com/projectdiscovery/mapcidr/asn" + "github.com/projectdiscovery/nuclei/v3/pkg/input/provider/dedupe" providerTypes "github.com/projectdiscovery/nuclei/v3/pkg/input/types" "github.com/projectdiscovery/nuclei/v3/pkg/protocols/common/contextargs" "github.com/projectdiscovery/nuclei/v3/pkg/protocols/common/protocolstate" @@ -48,6 +49,8 @@ type ListInputProvider struct { hostMapStream *filekv.FileDB hostMapStreamOnce sync.Once sync.Once + + fuzzDeduper *dedupe.FuzzingDeduper } // Options is a wrapper around types.Options structure @@ -78,6 +81,9 @@ func New(opts *Options) (*ListInputProvider, error) { }, excludedHosts: make(map[string]struct{}), } + if 
+		input.fuzzDeduper = dedupe.NewFuzzingDeduper()
+	}
 	if options.Stream {
 		fkvOptions := filekv.DefaultOptions
 		fkvOptions.MaxItems = DefaultMaxDedupeItemsCount
@@ -472,6 +478,12 @@ func (i *ListInputProvider) setItem(metaInput *contextargs.MetaInput) {
 	}
 	i.inputCount++ // tracks target count
+	if i.fuzzDeduper != nil {
+		if !i.fuzzDeduper.Add(metaInput.Target()) {
+			gologger.Verbose().Msgf("Ignoring duplicate fuzzing target: %s\n", metaInput.Target())
+			return
+		}
+	}
 	_ = i.hostMap.Set(key, nil)
 	if i.hostMapStream != nil {
 		i.setHostMapStream(key)
diff --git a/pkg/types/types.go b/pkg/types/types.go
index cab1aacf5..4c053feb7 100644
--- a/pkg/types/types.go
+++ b/pkg/types/types.go
@@ -276,6 +276,8 @@ type Options struct {
 	StoreResponseDir string
 	// DisableRedirects disables following redirects for http request module
 	DisableRedirects bool
+	// FuzzingDedupe enables deduplication of input URLs for fuzzing
+	FuzzingDedupe bool
 	// SNI custom hostname
 	SNI string
 	// InputFileMode specifies the mode of input file (jsonl, burp, openapi, swagger, etc)
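
A minimal usage sketch of the new deduper (illustrative only, not part of
the patch; it assumes the dedupe package introduced above, and the host and
URLs are made up):

	package main

	import (
		"fmt"

		"github.com/projectdiscovery/nuclei/v3/pkg/input/provider/dedupe"
	)

	func main() {
		deduper := dedupe.NewFuzzingDeduper()

		// First URL establishes the pattern
		// http://app.local/items/{numeric_id}?q={q}
		fmt.Println(deduper.Add("http://app.local/items/42?q=shoes")) // true

		// Same structure, different ID and value: reported as a duplicate.
		fmt.Println(deduper.Add("http://app.local/items/7?q=hats")) // false

		// A different parameter set yields a new pattern and is kept.
		fmt.Println(deduper.Add("http://app.local/items/42?page=1&q=shoes")) // true
	}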