From 69a2964357f4653c3f23caaa5b9ea0cf8cf1e4d1 Mon Sep 17 00:00:00 2001 From: Tarun Koyalwar Date: Tue, 5 Dec 2023 21:56:29 +0530 Subject: [PATCH] minor refactor to remove duplicated code --- url/url.go | 178 +++++++++++++++++++++++++++++++----------------- url/url_test.go | 3 +- 2 files changed, 117 insertions(+), 64 deletions(-) diff --git a/url/url.go b/url/url.go index bb0c1a4..3ca51ce 100644 --- a/url/url.go +++ b/url/url.go @@ -10,15 +10,6 @@ import ( stringsutil "github.com/projectdiscovery/utils/strings" ) -// disables autocorrect related to parsing -// Ex: if input is admin url.Parse considers admin as host which is not a valid domain name -var DisableAutoCorrect bool - -// disables autocorrect related to url.path -// Ex: if input is admin url.Parse considers admin as path(when DisableAutoCorrect is false) and adds missing prefix('/') -// On setting DisablePathAutoCorrection to true prefix is not added -var DisablePathAutoCorrection bool - // URL a wrapper around net/url.URL type URL struct { *url.URL @@ -28,6 +19,7 @@ type URL struct { IsRelative bool // If URL is relative Params *OrderedParams // Query Parameters // should call Update() method when directly updating wrapped url.URL or parameters + disableAutoCorrect bool // when true any type of autocorrect is disabled } // mergepath merges given relative path @@ -80,15 +72,15 @@ func (u *URL) Clone() *URL { } } ux := &url.URL{ - Scheme: u.Scheme, - Opaque: u.Opaque, - User: userinfo, - Host: u.Host, - Path: u.Path, - RawPath: u.RawPath, - RawQuery: u.RawQuery, - Fragment: u.Fragment, - // OmitHost: u.OmitHost, // only supported in 1.19 + Scheme: u.Scheme, + Opaque: u.Opaque, + User: userinfo, + Host: u.Host, + Path: u.Path, + RawPath: u.RawPath, + RawQuery: u.RawQuery, + Fragment: u.Fragment, + OmitHost: u.OmitHost, // only supported in 1.19 ForceQuery: u.ForceQuery, RawFragment: u.RawFragment, } @@ -177,7 +169,12 @@ func (u *URL) parseUnsafeRelativePath() { // ex: /%20test%0a =? // autocorrect if prefix is missing defer func() { - if !DisablePathAutoCorrection && !strings.HasPrefix(u.Path, "/") && u.Path != "" { + // u.Path (stdlib) is vague related to path i.e `path (relative paths may omit leading slash)` + // relative paths can have `/` prefix or not but this causes lot of edgecases as we have already + // seen i.e why we have two dedicated parsers for this + // ParseRelativePath --> always adds `/` if it is missing + // ParseRawRelativePath --> No normalizations like adding `/` + if !u.disableAutoCorrect && !strings.HasPrefix(u.Path, "/") && u.Path != "" { u.Path = "/" + u.Path } }() @@ -225,12 +222,12 @@ func (u *URL) fetchParams() { u.Update() } -// ParseURL +// ParseURL (can be relative or absolute) func Parse(inputURL string) (*URL, error) { return ParseURL(inputURL, false) } -// Parse and return URL +// Parse and return URL (can be relative or absolute) func ParseURL(inputURL string, unsafe bool) (*URL, error) { u := &URL{ URL: &url.URL{}, @@ -238,30 +235,94 @@ func ParseURL(inputURL string, unsafe bool) (*URL, error) { Unsafe: unsafe, Params: NewOrderedParams(), } + var err error + u, err = absoluteURLParser(u) + if err != nil { + return nil, err + } + if u.IsRelative { + return ParseRelativePath(inputURL, unsafe) + } + + // logical bug url is not relative but host is empty + if u.Host == "" { + return nil, errorutil.NewWithTag("urlutil", "failed to parse url `%v`", inputURL).Msgf("got empty host when url is not relative") + } + + // # Normalization 1: if value of u.Host does not look like a common domain + // it is most likely a relative path parsed as host + // this happens because of ambiguity of url.Parse + // because + // when parsing url like scanme.sh/my/path url.Parse() puts `scanme.sh/my/path` as path and host is empty + // to avoid this we always parse url with a schema prefix if it is missing (ex: https:// is not in input url) and then + // rule out the possiblity that given url is not a relative path + // this handles below edgecase + // u , err := url.Parse(`mypath`) + + if !strings.Contains(u.Host, ".") && !strings.Contains(u.Host, ":") && u.Host != "localhost" { + // TODO: should use a proper regex to validate hostname/ip + // currently domain names without (.) are not considered as valid and autocorrected + // this does not look like a valid domain , ipv4 or ipv6 + // consider it as relative + // use ParseAbosluteURL to avoid this issue + u.IsRelative = true + u.Path = inputURL + u.Host = "" + } + + return u, nil +} + +// ParseAbsoluteURL parses and returns absolute url +// should be preferred over others when input is known to be absolute url +// this reduces any normalization and autocorrection related to relative paths +// and returns error if input is relative path +func ParseAbsoluteURL(inputURL string, unsafe bool) (*URL, error) { + u := &URL{ + URL: &url.URL{}, + Original: inputURL, + Unsafe: unsafe, + Params: NewOrderedParams(), + } + var err error + u, err = absoluteURLParser(u) + if err != nil { + return nil, err + } + if u.IsRelative { + return nil, errorutil.NewWithTag("urlutil", "expected absolute url but got relative url input=%v,path=%v", inputURL, u.Path) + } + if u.URL.Host == "" { + return nil, errorutil.NewWithTag("urlutil", "something went wrong got empty host for absolute url=%v", inputURL) + } + return u, nil +} + +// absoluteURLParser is common absolute parser logic used to avoid duplication of code +func absoluteURLParser(u *URL) (*URL, error) { u.fetchParams() // filter out fragments and parameters only then parse path // we use u.Original because u.fetchParams() parses fragments and parameters // from u.Original (this is done to preserve query order in params and other edgecases) - inputURL = u.Original - if inputURL == "" { + if u.Original == "" { return nil, errorutil.NewWithTag("urlutil", "failed to parse url got empty input") } // Note: we consider //scanme.sh as valid (since all browsers accept this