feat: add initial fuzz input URL deduplication implementation

This commit is contained in:
Ice3man 2024-09-03 19:16:44 +05:30
parent 042b33de3d
commit fa6cac181e
6 changed files with 257 additions and 0 deletions

View File

@ -328,6 +328,7 @@ on extensive configurability, massive extensibility and ease of use.`)
flagSet.BoolVar(&fuzzFlag, "fuzz", false, "enable loading fuzzing templates (Deprecated: use -dast instead)"),
flagSet.BoolVar(&options.DAST, "dast", false, "enable / run dast (fuzz) nuclei templates"),
flagSet.BoolVarP(&options.DisplayFuzzPoints, "display-fuzz-points", "dfp", false, "display fuzz points in the output for debugging"),
flagSet.BoolVarP(&options.FuzzingDedupe, "fuzzing-dedupe", "fd", false, "deduplicate fuzzing url inputs"),
flagSet.IntVar(&options.FuzzParamFrequency, "fuzz-param-frequency", 10, "frequency of uninteresting parameters for fuzzing before skipping"),
flagSet.StringVarP(&options.FuzzAggressionLevel, "fuzz-aggression", "fa", "low", "fuzzing aggression level controls payload count for fuzz (low, medium, high)"),
)

View File

@ -152,6 +152,9 @@ func ValidateOptions(options *types.Options) error {
if options.Verbose && options.Silent {
return errors.New("both verbose and silent mode specified")
}
if options.FuzzingDedupe && options.Stream {
return errors.New("both fuzzing dedupe and stream mode specified")
}
if (options.HeadlessOptionalArguments != nil || options.ShowBrowser || options.UseInstalledChrome) && !options.Headless {
return errors.New("headless mode (-headless) is required if -ho, -sb, -sc or -lha are set")

View File

@ -0,0 +1,102 @@
// Package dedupe implements a duplicate URL deduplication mechanism
// for Nuclei DAST or Fuzzing inputs.
//
// It is used to remove similar or non-relevant inputs from fuzzing
// or DAST scans to reduce the number of requests made.
package dedupe
import (
"fmt"
"net/url"
"regexp"
"slices"
"strings"
mapsutil "github.com/projectdiscovery/utils/maps"
)
// FuzzingDeduper is a deduper for fuzzing inputs.
//
// Each URL is reduced to a normalized pattern before being stored:
//
//   - Query parameters are templated with their own names ("id=1" becomes
//     "id={id}") and sorted, so differing values or parameter orders collapse
//     to the same pattern.
//     TODO: Doesn't handle different values, everything is stripped. Maybe make it more flexible?
//   - Purely numeric IDs in the path are replaced with {numeric_id}.
//
// This allows us to deduplicate URLs with different query parameter values
// or orders but the same structure or key names.
type FuzzingDeduper struct {
	// items is the concurrency-safe set of URL patterns seen so far
	// (pattern -> struct{}).
	items *mapsutil.SyncLockMap[string, struct{}]
}
// NewFuzzingDeduper returns a ready-to-use deduper backed by a
// concurrency-safe set of previously seen URL patterns.
func NewFuzzingDeduper() *FuzzingDeduper {
	deduper := &FuzzingDeduper{}
	deduper.items = mapsutil.NewSyncLockMap[string, struct{}]()
	return deduper
}
// Add records the pattern for URL and reports whether it was seen for the
// first time. It returns false both for URLs that cannot be parsed and for
// URLs whose normalized pattern has already been registered.
func (d *FuzzingDeduper) Add(URL string) bool {
	pattern, err := generatePattern(URL)
	if err != nil {
		// unparseable input: never admitted
		return false
	}
	if _, seen := d.items.Get(pattern); seen {
		return false
	}
	d.items.Set(pattern, struct{}{})
	return true
}
// generatePattern reduces a raw URL to its deduplication key:
// scheme://host + normalized path, plus "?" and the templated, sorted query
// when any parameters are present. It returns an error when the input cannot
// be parsed as a request URI.
func generatePattern(rawURL string) (string, error) {
	parsed, err := url.ParseRequestURI(rawURL)
	if err != nil {
		return "", err
	}
	pattern := parsed.Scheme + "://" + parsed.Host + normalizePath(parsed.Path)
	if query := extractQuery(parsed.Query()); query != "" {
		pattern = pattern + "?" + query
	}
	return pattern, nil
}
// numericIDPathRegex matches one purely-numeric path segment, capturing the
// separator that follows ("/" or end of string) so replacements can preserve it.
var numericIDPathRegex = regexp.MustCompile(`/\d+(/|$)`)

// normalizePath replaces purely-numeric path segments with the {numeric_id}
// placeholder so URLs that differ only in numeric IDs produce the same pattern.
//
// The previous implementation collected match[0] values (which include the
// trailing "/") and fed them to strings.ReplaceAll, which dropped the slash
// between segments ("/users/123/posts" -> "/users/{numeric_id}posts") and
// could corrupt paths where the matched text recurred elsewhere ("/1/1" ->
// "/{numeric_id}1"). Replacing via the regex with the captured separator, and
// looping to a fixed point, handles adjacent numeric segments ("/1/2/3")
// whose matches would otherwise overlap on the shared "/".
func normalizePath(path string) string {
	for {
		normalized := numericIDPathRegex.ReplaceAllString(path, "/{numeric_id}$1")
		if normalized == path {
			// fixed point reached: replacements introduce no new digits,
			// so this loop always terminates
			return normalized
		}
		path = normalized
	}
}
func extractQuery(query url.Values) string {
normalizedParams := make([]string, 0, len(query))
for k, v := range query {
if len(v) == 0 {
normalizedParams = append(normalizedParams, k)
} else {
normalizedParams = append(normalizedParams, fmt.Sprintf("%s={%s}", k, k))
}
}
slices.Sort(normalizedParams)
return strings.Join(normalizedParams, "&")
}

View File

@ -0,0 +1,137 @@
package dedupe
import (
"fmt"
"testing"
"github.com/stretchr/testify/require"
)
// TestFuzzingDeduper exercises FuzzingDeduper.Add across table-driven URL
// pairs, a large generated set, numeric path parameters, and a realistic
// testphp.vulnweb.com corpus.
func TestFuzzingDeduper(t *testing.T) {
	t.Run("Basic URL Deduplication", func(t *testing.T) {
		tests := []struct {
			name     string
			urls     []string // inputs fed to Add, in order
			expected []bool   // expected Add return value for each input
		}{
			{
				name:     "Simple unique URLs",
				urls:     []string{"http://example.com/page1", "http://example.com/page2"},
				expected: []bool{true, true},
			},
			{
				name:     "Duplicate URLs",
				urls:     []string{"http://example.com/page1", "http://example.com/page1"},
				expected: []bool{true, false},
			},
			{
				// values are templated away, so only the parameter name matters
				name:     "URLs with different query param values",
				urls:     []string{"http://example.com/page?id=1", "http://example.com/page?id=2"},
				expected: []bool{true, false},
			},
			{
				// parameters are sorted before building the pattern
				name:     "URLs with different query param orders",
				urls:     []string{"http://example.com/page?a=1&b=2", "http://example.com/page?b=2&a=1"},
				expected: []bool{true, true},
			},
			{
				// trailing slashes are NOT normalized; these are distinct inputs
				name:     "URLs with and without trailing slash",
				urls:     []string{"http://example.com/page/", "http://example.com/page"},
				expected: []bool{true, true},
			},
			{
				name:     "URLs with different schemes",
				urls:     []string{"http://example.com", "https://example.com"},
				expected: []bool{true, true},
			},
			{
				name:     "URLs with query params and without",
				urls:     []string{"http://example.com/page", "http://example.com/page?param=value"},
				expected: []bool{true, true},
			},
			{
				// unparseable input: Add reports false rather than erroring
				name:     "Invalid URLs",
				urls:     []string{"http://example.com/page", "not a valid url"},
				expected: []bool{true, false},
			},
			{
				// empty-valued params still dedupe on name, order-insensitively
				name:     "URLs with empty query params",
				urls:     []string{"http://example.com/page?param1=&param2=", "http://example.com/page?param2=&param1="},
				expected: []bool{true, false},
			},
		}

		for _, tt := range tests {
			t.Run(tt.name, func(t *testing.T) {
				deduper := NewFuzzingDeduper()
				for i, url := range tt.urls {
					result := deduper.Add(url)
					require.Equal(t, tt.expected[i], result, "Add(%q) = %v, want %v", url, result, tt.expected[i])
				}
			})
		}
	})

	t.Run("Large Set Deduplication", func(t *testing.T) {
		deduper := NewFuzzingDeduper()
		baseURL := "http://example.com/page?id=%d&param=%s"
		// 1000 URLs that differ only in the id value must collapse to one pattern
		for i := 0; i < 1000; i++ {
			url := fmt.Sprintf(baseURL, i, "value")
			result := deduper.Add(url)
			if i == 0 {
				require.True(t, result, "First URL should be added")
			} else {
				require.False(t, result, "Duplicate URL pattern should not be added: %s", url)
			}
		}
		allItems := deduper.items.GetAll()
		require.Len(t, allItems, 1, "Expected 1 unique URL pattern, got %d", len(allItems))
	})

	t.Run("Path Parameters", func(t *testing.T) {
		// numeric path segments are templated to {numeric_id}
		deduper := NewFuzzingDeduper()
		require.True(t, deduper.Add("https://example.com/page/1337"))
		require.False(t, deduper.Add("https://example.com/page/1332"))
	})

	t.Run("TestPHP Vulnweb URLs", func(t *testing.T) {
		urls := []string{
			"http://testphp.vulnweb.com/hpp/?pp=12",
			"http://testphp.vulnweb.com/hpp/params.php?p=valid&pp=12",
			"http://testphp.vulnweb.com/artists.php?artist=3",
			"http://testphp.vulnweb.com/artists.php?artist=1",
			"http://testphp.vulnweb.com/artists.php?artist=2",
			"http://testphp.vulnweb.com/listproducts.php?artist=3",
			"http://testphp.vulnweb.com/listproducts.php?cat=4",
			"http://testphp.vulnweb.com/listproducts.php?cat=3",
			"http://testphp.vulnweb.com/listproducts.php?cat=2",
			"http://testphp.vulnweb.com/listproducts.php?artist=2",
			"http://testphp.vulnweb.com/listproducts.php?artist=1",
			"http://testphp.vulnweb.com/listproducts.php?cat=1",
			"http://testphp.vulnweb.com/showimage.php?file=./pictures/6.jpg",
			"http://testphp.vulnweb.com/product.php?pic=6",
			"http://testphp.vulnweb.com/showimage.php?file=./pictures/6.jpg&size=160",
		}
		// unique (path, parameter-name-set) combinations in the corpus above
		expectedUnique := 8
		deduper := NewFuzzingDeduper()
		uniqueCount := 0
		for _, url := range urls {
			if deduper.Add(url) {
				uniqueCount++
			}
		}
		require.Equal(t, expectedUnique, uniqueCount, "Expected %d unique URLs, but got %d", expectedUnique, uniqueCount)

		// Test for duplicates
		for _, url := range urls {
			require.False(t, deduper.Add(url), "URL should have been identified as duplicate: %s", url)
		}
	})
}

View File

@ -19,6 +19,7 @@ import (
"github.com/projectdiscovery/hmap/filekv"
"github.com/projectdiscovery/hmap/store/hybrid"
"github.com/projectdiscovery/mapcidr/asn"
"github.com/projectdiscovery/nuclei/v3/pkg/input/provider/dedupe"
providerTypes "github.com/projectdiscovery/nuclei/v3/pkg/input/types"
"github.com/projectdiscovery/nuclei/v3/pkg/protocols/common/contextargs"
"github.com/projectdiscovery/nuclei/v3/pkg/protocols/common/protocolstate"
@ -48,6 +49,8 @@ type ListInputProvider struct {
hostMapStream *filekv.FileDB
hostMapStreamOnce sync.Once
sync.Once
fuzzDeduper *dedupe.FuzzingDeduper
}
// Options is a wrapper around types.Options structure
@ -78,6 +81,9 @@ func New(opts *Options) (*ListInputProvider, error) {
},
excludedHosts: make(map[string]struct{}),
}
if options.FuzzingDedupe {
input.fuzzDeduper = dedupe.NewFuzzingDeduper()
}
if options.Stream {
fkvOptions := filekv.DefaultOptions
fkvOptions.MaxItems = DefaultMaxDedupeItemsCount
@ -472,6 +478,12 @@ func (i *ListInputProvider) setItem(metaInput *contextargs.MetaInput) {
}
i.inputCount++ // tracks target count
if i.fuzzDeduper != nil {
if !i.fuzzDeduper.Add(metaInput.Target()) {
gologger.Verbose().Msgf("Ignoring duplicate fuzzing target: %s\n", metaInput.Target())
return
}
}
_ = i.hostMap.Set(key, nil)
if i.hostMapStream != nil {
i.setHostMapStream(key)

View File

@ -276,6 +276,8 @@ type Options struct {
StoreResponseDir string
// DisableRedirects disables following redirects for http request module
DisableRedirects bool
// FuzzingDedupe enables deduplication of input URLs for fuzzing
FuzzingDedupe bool
// SNI custom hostname
SNI string
// InputFileMode specifies the mode of input file (jsonl, burp, openapi, swagger, etc)