minor refactor to remove duplicated code

projectdiscovery · Dec 5, 2023 · 69a2964 · 69a2964
1 parent de01da7
commit 69a2964
Show file tree

Hide file tree

Showing 2 changed files with 117 additions and 64 deletions.
diff --git a/url/url.go b/url/url.go
@@ -10,15 +10,6 @@ import (
 	stringsutil "github.com/projectdiscovery/utils/strings"
 )
 
-// disables autocorrect related to parsing
-// Ex: if input is admin url.Parse considers admin as host which is not a valid domain name
-var DisableAutoCorrect bool
-
-// disables autocorrect related to url.path
-// Ex: if input is admin url.Parse considers admin as path(when DisableAutoCorrect is false) and adds missing prefix('/')
-// On setting DisablePathAutoCorrection to true prefix is not added
-var DisablePathAutoCorrection bool
-
 // URL a wrapper around net/url.URL
 type URL struct {
 	*url.URL
@@ -28,6 +19,7 @@ type URL struct {
 	IsRelative bool           // If URL is relative
 	Params     *OrderedParams // Query Parameters
 	// should call Update() method when directly updating wrapped url.URL or parameters
+	disableAutoCorrect bool // when true any type of autocorrect is disabled
 }
 
 // mergepath merges given relative path
@@ -80,15 +72,15 @@ func (u *URL) Clone() *URL {
 		}
 	}
 	ux := &url.URL{
-		Scheme:   u.Scheme,
-		Opaque:   u.Opaque,
-		User:     userinfo,
-		Host:     u.Host,
-		Path:     u.Path,
-		RawPath:  u.RawPath,
-		RawQuery: u.RawQuery,
-		Fragment: u.Fragment,
-		// OmitHost:    u.OmitHost, // only supported in 1.19
+		Scheme:      u.Scheme,
+		Opaque:      u.Opaque,
+		User:        userinfo,
+		Host:        u.Host,
+		Path:        u.Path,
+		RawPath:     u.RawPath,
+		RawQuery:    u.RawQuery,
+		Fragment:    u.Fragment,
+		OmitHost:    u.OmitHost, // only supported in 1.19
 		ForceQuery:  u.ForceQuery,
 		RawFragment: u.RawFragment,
 	}
@@ -177,7 +169,12 @@ func (u *URL) parseUnsafeRelativePath() {
 	// ex: /%20test%0a =?
 	// autocorrect if prefix is missing
 	defer func() {
-		if !DisablePathAutoCorrection && !strings.HasPrefix(u.Path, "/") && u.Path != "" {
+		// u.Path (stdlib) is vague related to path i.e `path (relative paths may omit leading slash)`
+		// relative paths can have `/` prefix or not but this causes lot of edgecases as we have already
+		// seen i.e why we have two dedicated parsers for this
+		// ParseRelativePath --> always adds `/` if it is missing
+		// ParseRawRelativePath --> No normalizations like adding `/`
+		if !u.disableAutoCorrect && !strings.HasPrefix(u.Path, "/") && u.Path != "" {
 			u.Path = "/" + u.Path
 		}
 	}()
@@ -225,43 +222,107 @@ func (u *URL) fetchParams() {
 	u.Update()
 }
 
-// ParseURL
+// ParseURL (can be relative or absolute)
 func Parse(inputURL string) (*URL, error) {
 	return ParseURL(inputURL, false)
 }
 
-// Parse and return URL
+// Parse and return URL (can be relative or absolute)
 func ParseURL(inputURL string, unsafe bool) (*URL, error) {
 	u := &URL{
 		URL:      &url.URL{},
 		Original: inputURL,
 		Unsafe:   unsafe,
 		Params:   NewOrderedParams(),
 	}
+	var err error
+	u, err = absoluteURLParser(u)
+	if err != nil {
+		return nil, err
+	}
+	if u.IsRelative {
+		return ParseRelativePath(inputURL, unsafe)
+	}
+
+	// logical bug url is not relative but host is empty
+	if u.Host == "" {
+		return nil, errorutil.NewWithTag("urlutil", "failed to parse url `%v`", inputURL).Msgf("got empty host when url is not relative")
+	}
+
+	// # Normalization 1: if value of u.Host does not look like a common domain
+	// it is most likely a relative path parsed as host
+	// this happens because of ambiguity of url.Parse
+	// because
+	// when parsing url like scanme.sh/my/path url.Parse() puts `scanme.sh/my/path` as path and host is empty
+	// to avoid this we always parse url with a schema prefix if it is missing (ex: https:// is not in input url) and then
+	// rule out the possiblity that given url is not a relative path
+	// this handles below edgecase
+	// u , err :=  url.Parse(`mypath`)
+
+	if !strings.Contains(u.Host, ".") && !strings.Contains(u.Host, ":") && u.Host != "localhost" {
+		// TODO: should use a proper regex to validate hostname/ip
+		// currently domain names without (.) are not considered as valid and autocorrected
+		// this does not look like a valid domain , ipv4 or ipv6
+		// consider it as relative
+		// use ParseAbosluteURL to avoid this issue
+		u.IsRelative = true
+		u.Path = inputURL
+		u.Host = ""
+	}
+
+	return u, nil
+}
+
+// ParseAbsoluteURL parses and returns absolute url
+// should be preferred over others when input is known to be absolute url
+// this reduces any normalization and autocorrection related to relative paths
+// and returns error if input is relative path
+func ParseAbsoluteURL(inputURL string, unsafe bool) (*URL, error) {
+	u := &URL{
+		URL:      &url.URL{},
+		Original: inputURL,
+		Unsafe:   unsafe,
+		Params:   NewOrderedParams(),
+	}
+	var err error
+	u, err = absoluteURLParser(u)
+	if err != nil {
+		return nil, err
+	}
+	if u.IsRelative {
+		return nil, errorutil.NewWithTag("urlutil", "expected absolute url but got relative url input=%v,path=%v", inputURL, u.Path)
+	}
+	if u.URL.Host == "" {
+		return nil, errorutil.NewWithTag("urlutil", "something went wrong got empty host for absolute url=%v", inputURL)
+	}
+	return u, nil
+}
+
+// absoluteURLParser is common absolute parser logic used to avoid duplication of code
+func absoluteURLParser(u *URL) (*URL, error) {
 	u.fetchParams()
 	// filter out fragments and parameters only then parse path
 	// we use u.Original because u.fetchParams() parses fragments and parameters
 	// from u.Original (this is done to preserve query order in params and other edgecases)
-	inputURL = u.Original
-	if inputURL == "" {
+	if u.Original == "" {
 		return nil, errorutil.NewWithTag("urlutil", "failed to parse url got empty input")
 	}
 
 	// Note: we consider //scanme.sh as valid  (since all browsers accept this <script src="//ajax.googleapis.com/ajax/xx">)
-	if strings.HasPrefix(inputURL, "/") && !strings.HasPrefix(inputURL, "//") {
+	if strings.HasPrefix(u.Original, "/") && !strings.HasPrefix(u.Original, "//") {
 		// this is definitely a relative path
 		u.IsRelative = true
 		u.Path = u.Original
 		return u, nil
 	}
 	// Try to parse host related input
-	if stringsutil.HasPrefixAny(inputURL, HTTP+SchemeSeparator, HTTPS+SchemeSeparator, "//") || strings.Contains(inputURL, "://") {
+	if stringsutil.HasPrefixAny(u.Original, HTTP+SchemeSeparator, HTTPS+SchemeSeparator, "//") {
 		u.IsRelative = false
-		urlparse, parseErr := url.Parse(inputURL)
+		urlparse, parseErr := url.Parse(u.Original)
 		if parseErr != nil {
 			// for parse errors in unsafe way try parsing again
-			if unsafe {
-				urlparse = parseUnsafeFullURL(inputURL)
+			if u.Unsafe {
+				urlparse = parseUnsafeFullURL(u.Original)
 				if urlparse != nil {
 					parseErr = nil
 				}
@@ -274,7 +335,7 @@ func ParseURL(inputURL string, unsafe bool) (*URL, error) {
 	} else {
 		// if no prefix try to parse it with https
 		// if failed we consider it as a relative path and not a full url
-		urlparse, parseErr := url.Parse(HTTPS + SchemeSeparator + inputURL)
+		urlparse, parseErr := url.Parse(HTTPS + SchemeSeparator + u.Original)
 		if parseErr != nil {
 			// most likely a relativeurl
 			u.IsRelative = true
@@ -284,63 +345,56 @@ func ParseURL(inputURL string, unsafe bool) (*URL, error) {
 			copy(u.URL, urlparse)
 		}
 	}
-
-	// try parsing path
-	if !u.IsRelative {
-		// if parsing is successful validate and autocorrect
-		//ex: when inputURL is admin `url.Parse()` considers admin as Host with parsed with https://
-		// i.e https://admin which is not valid/accepted domain
-		//TODO: Properly Validate using regex
-		if u.Host == "" {
-			// this is unexpected case return err
-			return nil, errorutil.NewWithTag("urlutil", "failed to parse url %v got empty host", inputURL)
-		}
-		// TODO: should use a proper regex to validate hostname/ip
-		// currently domain names without (.) are not considered as valid and autocorrected
-		// if DisableAutoCorrect is false
-		if !strings.Contains(u.Host, ".") && !strings.Contains(u.Host, ":") && u.Host != "localhost" {
-			// this does not look like a valid domain , ipv4 or ipv6
-			// consider it as relative
-			if !DisableAutoCorrect {
-				u.IsRelative = true
-				u.Path = inputURL
-				u.Host = ""
-			}
-		}
-	}
-	if !u.IsRelative && u.Host == "" {
-		return nil, errorutil.NewWithTag("urlutil", "failed to parse url `%v`", inputURL).Msgf("got empty host when url is not relative")
-	}
-	if u.IsRelative {
-		return ParseRelativePath(inputURL, unsafe)
-	}
 	return u, nil
 }
 
 // ParseRelativePath parses and returns relative path
+// should be preferred over others when input is known to be relative path
+// this reduces any normalization and autocorrection related to absolute paths
+// and returns error if input is absolute path
 func ParseRelativePath(inputURL string, unsafe bool) (*URL, error) {
 	u := &URL{
 		URL:        &url.URL{},
 		Original:   inputURL,
 		Unsafe:     unsafe,
 		IsRelative: true,
 	}
+	return relativePathParser(u)
+}
+
+// ParseRelativePath
+func ParseRawRelativePath(inputURL string, unsafe bool) (*URL, error) {
+	u := &URL{
+		URL:                &url.URL{},
+		Original:           inputURL,
+		Unsafe:             unsafe,
+		IsRelative:         true,
+		disableAutoCorrect: true,
+	}
+	return relativePathParser(u)
+}
+
+// relativePathParser is common relative path parser logic used to avoid duplication of code
+func relativePathParser(u *URL) (*URL, error) {
 	u.fetchParams()
-	urlparse, parseErr := url.Parse(inputURL)
+	urlparse, parseErr := url.Parse(u.Original)
 	if parseErr != nil {
-		if !unsafe {
+		if !u.Unsafe {
 			// should return error if not unsafe url
 			return nil, errorutil.NewWithErr(parseErr).WithTag("urlutil").Msgf("failed to parse input url")
 		} else {
 			// if unsafe do not rely on net/url.Parse
-			u.Path = inputURL
+			u.Path = u.Original
 		}
 	}
 	if urlparse != nil {
 		urlparse.Host = ""
 		copy(u.URL, urlparse)
 	}
 	u.parseUnsafeRelativePath()
+	if u.Host != "" {
+		return nil, errorutil.NewWithTag("urlutil", "expected relative path but got absolute path with host=%v,input=%v", u.Host, u.Original)
+	}
 	return u, nil
 }
 

diff --git a/url/url_test.go b/url/url_test.go
@@ -190,10 +190,9 @@ func TestUnicodeEscapeWithUnsafe(t *testing.T) {
 	}{
 		{"https://scanme.sh/%u002e%u002e/%u002e%u002e/1.txt.it", "https://scanme.sh/%u002e%u002e/%u002e%u002e/1.txt.it"},
 	}
-	DisableAutoCorrect = true
 
 	for _, v := range testcases {
-		urlx, err := ParseURL(v.input, true)
+		urlx, err := ParseAbsoluteURL(v.input, true)
 		require.Nilf(t, err, "got error for url %v", v.input)
 		require.Equal(t, v.expected, urlx.String())
 	}