Skip to content

Commit

Permalink
minor refactor to remove duplicated code
Browse files Browse the repository at this point in the history
  • Loading branch information
tarunKoyalwar committed Dec 5, 2023
1 parent de01da7 commit 69a2964
Show file tree
Hide file tree
Showing 2 changed files with 117 additions and 64 deletions.
178 changes: 116 additions & 62 deletions url/url.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,6 @@ import (
stringsutil "github.com/projectdiscovery/utils/strings"
)

// disables autocorrect related to parsing
// Ex: if input is admin url.Parse considers admin as host which is not a valid domain name
var DisableAutoCorrect bool

// disables autocorrect related to url.path
// Ex: if input is admin url.Parse considers admin as path(when DisableAutoCorrect is false) and adds missing prefix('/')
// On setting DisablePathAutoCorrection to true prefix is not added
var DisablePathAutoCorrection bool

// URL a wrapper around net/url.URL
type URL struct {
*url.URL
Expand All @@ -28,6 +19,7 @@ type URL struct {
IsRelative bool // If URL is relative
Params *OrderedParams // Query Parameters
// should call Update() method when directly updating wrapped url.URL or parameters
disableAutoCorrect bool // when true any type of autocorrect is disabled
}

// mergepath merges given relative path
Expand Down Expand Up @@ -80,15 +72,15 @@ func (u *URL) Clone() *URL {
}
}
ux := &url.URL{
Scheme: u.Scheme,
Opaque: u.Opaque,
User: userinfo,
Host: u.Host,
Path: u.Path,
RawPath: u.RawPath,
RawQuery: u.RawQuery,
Fragment: u.Fragment,
// OmitHost: u.OmitHost, // only supported in 1.19
Scheme: u.Scheme,
Opaque: u.Opaque,
User: userinfo,
Host: u.Host,
Path: u.Path,
RawPath: u.RawPath,
RawQuery: u.RawQuery,
Fragment: u.Fragment,
OmitHost: u.OmitHost, // only supported in 1.19
ForceQuery: u.ForceQuery,
RawFragment: u.RawFragment,
}
Expand Down Expand Up @@ -177,7 +169,12 @@ func (u *URL) parseUnsafeRelativePath() {
// ex: /%20test%0a =?
// autocorrect if prefix is missing
defer func() {
if !DisablePathAutoCorrection && !strings.HasPrefix(u.Path, "/") && u.Path != "" {
// u.Path (stdlib) is vague related to path i.e `path (relative paths may omit leading slash)`
// relative paths can have `/` prefix or not but this causes lot of edgecases as we have already
// seen i.e why we have two dedicated parsers for this
// ParseRelativePath --> always adds `/` if it is missing
// ParseRawRelativePath --> No normalizations like adding `/`
if !u.disableAutoCorrect && !strings.HasPrefix(u.Path, "/") && u.Path != "" {
u.Path = "/" + u.Path
}
}()
Expand Down Expand Up @@ -225,43 +222,107 @@ func (u *URL) fetchParams() {
u.Update()
}

// ParseURL
// ParseURL (can be relative or absolute)
func Parse(inputURL string) (*URL, error) {
return ParseURL(inputURL, false)
}

// Parse and return URL
// Parse and return URL (can be relative or absolute)
func ParseURL(inputURL string, unsafe bool) (*URL, error) {
u := &URL{
URL: &url.URL{},
Original: inputURL,
Unsafe: unsafe,
Params: NewOrderedParams(),
}
var err error
u, err = absoluteURLParser(u)
if err != nil {
return nil, err
}
if u.IsRelative {
return ParseRelativePath(inputURL, unsafe)
}

// logical bug url is not relative but host is empty
if u.Host == "" {
return nil, errorutil.NewWithTag("urlutil", "failed to parse url `%v`", inputURL).Msgf("got empty host when url is not relative")
}

// # Normalization 1: if value of u.Host does not look like a common domain
// it is most likely a relative path parsed as host
// this happens because of ambiguity of url.Parse
// because
// when parsing url like scanme.sh/my/path url.Parse() puts `scanme.sh/my/path` as path and host is empty
// to avoid this we always parse url with a schema prefix if it is missing (ex: https:// is not in input url) and then
// rule out the possiblity that given url is not a relative path
// this handles below edgecase
// u , err := url.Parse(`mypath`)

if !strings.Contains(u.Host, ".") && !strings.Contains(u.Host, ":") && u.Host != "localhost" {
// TODO: should use a proper regex to validate hostname/ip
// currently domain names without (.) are not considered as valid and autocorrected
// this does not look like a valid domain , ipv4 or ipv6
// consider it as relative
// use ParseAbosluteURL to avoid this issue
u.IsRelative = true
u.Path = inputURL
u.Host = ""
}

return u, nil
}

// ParseAbsoluteURL parses and returns absolute url
// should be preferred over others when input is known to be absolute url
// this reduces any normalization and autocorrection related to relative paths
// and returns error if input is relative path
func ParseAbsoluteURL(inputURL string, unsafe bool) (*URL, error) {
u := &URL{
URL: &url.URL{},
Original: inputURL,
Unsafe: unsafe,
Params: NewOrderedParams(),
}
var err error
u, err = absoluteURLParser(u)
if err != nil {
return nil, err
}
if u.IsRelative {
return nil, errorutil.NewWithTag("urlutil", "expected absolute url but got relative url input=%v,path=%v", inputURL, u.Path)
}
if u.URL.Host == "" {
return nil, errorutil.NewWithTag("urlutil", "something went wrong got empty host for absolute url=%v", inputURL)
}
return u, nil
}

// absoluteURLParser is common absolute parser logic used to avoid duplication of code
func absoluteURLParser(u *URL) (*URL, error) {
u.fetchParams()
// filter out fragments and parameters only then parse path
// we use u.Original because u.fetchParams() parses fragments and parameters
// from u.Original (this is done to preserve query order in params and other edgecases)
inputURL = u.Original
if inputURL == "" {
if u.Original == "" {
return nil, errorutil.NewWithTag("urlutil", "failed to parse url got empty input")
}

// Note: we consider //scanme.sh as valid (since all browsers accept this <script src="//ajax.googleapis.com/ajax/xx">)
if strings.HasPrefix(inputURL, "/") && !strings.HasPrefix(inputURL, "//") {
if strings.HasPrefix(u.Original, "/") && !strings.HasPrefix(u.Original, "//") {
// this is definitely a relative path
u.IsRelative = true
u.Path = u.Original
return u, nil
}
// Try to parse host related input
if stringsutil.HasPrefixAny(inputURL, HTTP+SchemeSeparator, HTTPS+SchemeSeparator, "//") || strings.Contains(inputURL, "://") {
if stringsutil.HasPrefixAny(u.Original, HTTP+SchemeSeparator, HTTPS+SchemeSeparator, "//") {
u.IsRelative = false
urlparse, parseErr := url.Parse(inputURL)
urlparse, parseErr := url.Parse(u.Original)
if parseErr != nil {
// for parse errors in unsafe way try parsing again
if unsafe {
urlparse = parseUnsafeFullURL(inputURL)
if u.Unsafe {
urlparse = parseUnsafeFullURL(u.Original)
if urlparse != nil {
parseErr = nil
}
Expand All @@ -274,7 +335,7 @@ func ParseURL(inputURL string, unsafe bool) (*URL, error) {
} else {
// if no prefix try to parse it with https
// if failed we consider it as a relative path and not a full url
urlparse, parseErr := url.Parse(HTTPS + SchemeSeparator + inputURL)
urlparse, parseErr := url.Parse(HTTPS + SchemeSeparator + u.Original)
if parseErr != nil {
// most likely a relativeurl
u.IsRelative = true
Expand All @@ -284,63 +345,56 @@ func ParseURL(inputURL string, unsafe bool) (*URL, error) {
copy(u.URL, urlparse)
}
}

// try parsing path
if !u.IsRelative {
// if parsing is successful validate and autocorrect
//ex: when inputURL is admin `url.Parse()` considers admin as Host with parsed with https://
// i.e https://admin which is not valid/accepted domain
//TODO: Properly Validate using regex
if u.Host == "" {
// this is unexpected case return err
return nil, errorutil.NewWithTag("urlutil", "failed to parse url %v got empty host", inputURL)
}
// TODO: should use a proper regex to validate hostname/ip
// currently domain names without (.) are not considered as valid and autocorrected
// if DisableAutoCorrect is false
if !strings.Contains(u.Host, ".") && !strings.Contains(u.Host, ":") && u.Host != "localhost" {
// this does not look like a valid domain , ipv4 or ipv6
// consider it as relative
if !DisableAutoCorrect {
u.IsRelative = true
u.Path = inputURL
u.Host = ""
}
}
}
if !u.IsRelative && u.Host == "" {
return nil, errorutil.NewWithTag("urlutil", "failed to parse url `%v`", inputURL).Msgf("got empty host when url is not relative")
}
if u.IsRelative {
return ParseRelativePath(inputURL, unsafe)
}
return u, nil
}

// ParseRelativePath parses and returns relative path
// should be preferred over others when input is known to be relative path
// this reduces any normalization and autocorrection related to absolute paths
// and returns error if input is absolute path
func ParseRelativePath(inputURL string, unsafe bool) (*URL, error) {
u := &URL{
URL: &url.URL{},
Original: inputURL,
Unsafe: unsafe,
IsRelative: true,
}
return relativePathParser(u)
}

// ParseRelativePath
func ParseRawRelativePath(inputURL string, unsafe bool) (*URL, error) {
u := &URL{
URL: &url.URL{},
Original: inputURL,
Unsafe: unsafe,
IsRelative: true,
disableAutoCorrect: true,
}
return relativePathParser(u)
}

// relativePathParser is common relative path parser logic used to avoid duplication of code
func relativePathParser(u *URL) (*URL, error) {
u.fetchParams()
urlparse, parseErr := url.Parse(inputURL)
urlparse, parseErr := url.Parse(u.Original)
if parseErr != nil {
if !unsafe {
if !u.Unsafe {
// should return error if not unsafe url
return nil, errorutil.NewWithErr(parseErr).WithTag("urlutil").Msgf("failed to parse input url")
} else {
// if unsafe do not rely on net/url.Parse
u.Path = inputURL
u.Path = u.Original
}
}
if urlparse != nil {
urlparse.Host = ""
copy(u.URL, urlparse)
}
u.parseUnsafeRelativePath()
if u.Host != "" {
return nil, errorutil.NewWithTag("urlutil", "expected relative path but got absolute path with host=%v,input=%v", u.Host, u.Original)
}
return u, nil
}

Expand Down
3 changes: 1 addition & 2 deletions url/url_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -190,10 +190,9 @@ func TestUnicodeEscapeWithUnsafe(t *testing.T) {
}{
{"https://scanme.sh/%u002e%u002e/%u002e%u002e/1.txt.it", "https://scanme.sh/%u002e%u002e/%u002e%u002e/1.txt.it"},
}
DisableAutoCorrect = true

for _, v := range testcases {
urlx, err := ParseURL(v.input, true)
urlx, err := ParseAbsoluteURL(v.input, true)
require.Nilf(t, err, "got error for url %v", v.input)
require.Equal(t, v.expected, urlx.String())
}
Expand Down

0 comments on commit 69a2964

Please sign in to comment.