Commit: Improve

theblackturtle committed Feb 11, 2020
1 parent 1c5cdad commit 008623d
Showing 8 changed files with 61 additions and 60 deletions.
17 changes: 9 additions & 8 deletions core/crawler.go
@@ -30,11 +30,11 @@ type Crawler struct {
urlSet *stringset.StringFilter
formSet *stringset.StringFilter

site string
site *url.URL
domain string
}

func NewCrawler(site string, cmd *cobra.Command) *Crawler {
func NewCrawler(site *url.URL, cmd *cobra.Command) *Crawler {
domain := GetDomain(site)
if domain == "" {
Logger.Error("Failed to parse domain")
@@ -160,7 +160,7 @@ func NewCrawler(site string, cmd *cobra.Command) *Crawler {
var output *Output
outputFolder, _ := cmd.Flags().GetString("output")
if outputFolder != "" {
filename := strings.ReplaceAll(GetHostname(site), ".", "_")
filename := strings.ReplaceAll(site.Hostname(), ".", "_")
output = NewOutput(outputFolder, filename)
}

@@ -274,11 +274,11 @@ func (crawler *Crawler) Start() {
// If the JS file is minified, try to find the original (non-minified) version
if strings.Contains(jsFileUrl, ".min.js") {
originalJS := strings.ReplaceAll(jsFileUrl, ".min.js", ".js")
crawler.linkFinder(crawler.site, originalJS)
crawler.linkFinder(originalJS)
}

// Request and Get JS link
crawler.linkFinder(crawler.site, jsFileUrl)
crawler.linkFinder(jsFileUrl)
}
})

@@ -298,6 +298,7 @@ func (crawler *Crawler) Start() {
})

crawler.C.OnError(func(response *colly.Response, err error) {
Logger.Debugf("Error request: %s - Status code: %v - Error: %s", response.Request.URL.String(), response.StatusCode, err)
// Status == 0 means "The server IP address could not be found."
if response.StatusCode == 404 || response.StatusCode == 429 || response.StatusCode == 0 {
return
@@ -317,7 +318,7 @@ }
}
})

_ = crawler.C.Visit(crawler.site)
_ = crawler.C.Visit(crawler.site.String())
}

// Find subdomains from response
@@ -350,7 +351,7 @@ func (crawler *Crawler) findAWSS3(resp string) {

// This function requests and parses an external JavaScript file,
// then passes the discovered links to the main collector, which enforces the scope
func (crawler *Crawler) linkFinder(site string, jsUrl string) {
func (crawler *Crawler) linkFinder(jsUrl string) {
client := http.Client{Transport: &http.Transport{TLSClientConfig: &tls.Config{InsecureSkipVerify: true}}}
resp, err := client.Get(jsUrl)
if err != nil || resp.StatusCode != 200 {
@@ -398,6 +399,6 @@ func (crawler *Crawler) linkFinder(site string, jsUrl string) {
crawler.Output.WriteToFile(outputFormat)
}
// Try to request JS path
_ = crawler.C.Visit(FixUrl(link, site))
_ = crawler.C.Visit(FixUrl(link, crawler.site))
}
}
4 changes: 2 additions & 2 deletions core/linkfinder.go
@@ -5,11 +5,11 @@ import (
"strings"
)

var linkFinderRegex = regexp.MustCompile(`(?:"|')(((?:[a-zA-Z]{1,10}://|//)[^"'/]{1,}\.[a-zA-Z]{2,}[^"']{0,})|((?:/|\.\./|\./)[^"'><,;| *()(%%$^/\\\[\]][^"'><,;|()]{1,})|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{1,}\.(?:[a-zA-Z]{1,4}|action)(?:[\?|/][^"|']{0,}|))|([a-zA-Z0-9_\-]{1,}\.(?:php|asp|aspx|jsp|json|action|html|js|txt|xml)(?:\?[^"|^']{0,}|)))(?:"|')`)
var linkFinderRegex = regexp.MustCompile(`(?:"|')(((?:[a-zA-Z]{1,10}://|//)[^"'/]{1,}\.[a-zA-Z]{2,}[^"']{0,})|((?:/|\.\./|\./)[^"'><,;| *()(%%$^/\\\[\]][^"'><,;|()]{1,})|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{1,}\.(?:[a-zA-Z]{1,4}|action)(?:[\?|#][^"|']{0,}|))|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{3,}(?:[\?|#][^"|']{0,}|))|([a-zA-Z0-9_\-]{1,}\.(?:php|asp|aspx|jsp|json|action|html|js|txt|xml)(?:[\?|#][^"|']{0,}|)))(?:"|')`)

func LinkFinder(source string) ([]string, error) {
var links []string
source = strings.ToLower(source)
//source = strings.ToLower(source)
if len(source) > 1000000 {
source = strings.ReplaceAll(source, ";", ";\r\n")
source = strings.ReplaceAll(source, ",", ",\r\n")
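
This revision stops lowercasing the source and teaches the regex to accept `#` fragments plus an extra bare-path form, so mixed-case paths and fragment-bearing URLs survive extraction. A hypothetical test in package core illustrating how the exported LinkFinder might be exercised (the sample source string is mine, not from the commit):

package core

import "testing"

// Hypothetical test: quoted relative paths and absolute URLs (including
// fragment-bearing ones) embedded in JS source should be extracted.
func TestLinkFinder(t *testing.T) {
	source := `var a = "/api/V1/Users"; var b = "https://example.com/assets/app.min.js#v2";`
	links, err := LinkFinder(source)
	if err != nil {
		t.Fatal(err)
	}
	t.Log(links)
}
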
8 changes: 4 additions & 4 deletions core/othersource_test.go
@@ -5,13 +5,13 @@ import "testing"
var domain = "yahoo.com"

func TestOtherSources(t *testing.T) {
urls := OtherSources(domain,false)
urls := OtherSources(domain, false)
t.Log(len(urls))
t.Log(urls)
}

func TestGetCommonCrawlURLs(t *testing.T) {
urls, err := getCommonCrawlURLs(domain,false)
urls, err := getCommonCrawlURLs(domain, false)
if err != nil {
t.Fatal(err)
}
@@ -20,7 +20,7 @@ func TestGetCommonCrawlURLs(t *testing.T) {
}

func TestGetVirusTotalURLs(t *testing.T) {
urls, err := getVirusTotalURLs(domain,false)
urls, err := getVirusTotalURLs(domain, false)
if err != nil {
t.Fatal(err)
}
@@ -29,7 +29,7 @@ func TestGetVirusTotalURLs(t *testing.T) {
}

func TestGetWaybackURLs(t *testing.T) {
urls, err := getWaybackURLs(domain,false)
urls, err := getWaybackURLs(domain, false)
if err != nil {
t.Fatal(err)
}
9 changes: 5 additions & 4 deletions core/robots.go
@@ -5,21 +5,22 @@ import (
"github.com/gocolly/colly"
"io/ioutil"
"net/http"
"net/url"
"regexp"
"strings"
"sync"
)

func ParseRobots(site string, depth int, output *Output, c *colly.Collector, wg *sync.WaitGroup) {
func ParseRobots(site *url.URL, output *Output, c *colly.Collector, wg *sync.WaitGroup) {
defer wg.Done()
robotsURL := site + "/robots.txt"
robotsURL := site.String() + "/robots.txt"

resp, err := http.Get(robotsURL)
if err != nil {
return
}
if resp.StatusCode == 200 {
Logger.Infof("Found %s/robots.txt", site)
Logger.Infof("Found robots.txt: %s", robotsURL)
body, err := ioutil.ReadAll(resp.Body)
if err != nil {
return
@@ -30,7 +31,7 @@ func ParseRobots(site string, depth int, output *Output, c *colly.Collector, wg
for _, line := range lines {
if strings.Contains(line, "llow: ") {
url := re.ReplaceAllString(line, "")
url = site + url
url = FixUrl(url, site)
outputFormat := fmt.Sprintf("[robots] - %s", url)
fmt.Println(outputFormat)
if output != nil {
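
ParseRobots keeps its line-based approach: any line containing "llow: " (which matches both Allow and Disallow) has the directive stripped and the remaining path resolved against the site with FixUrl. The directive-stripping regexp itself sits outside this hunk, so the one below is an assumption; a self-contained sketch of the same parsing idea:

package main

import (
	"fmt"
	"regexp"
	"strings"
)

func main() {
	body := "User-agent: *\nDisallow: /admin/\nAllow: /public/\n"

	// Assumed equivalent of the regexp used in robots.go: strip the
	// "Allow: " / "Disallow: " prefix and keep only the path.
	re := regexp.MustCompile(`(?i)^\s*(?:dis)?allow:\s*`)
	for _, line := range strings.Split(body, "\n") {
		if strings.Contains(line, "llow: ") {
			path := re.ReplaceAllString(line, "")
			fmt.Println("[robots] - https://example.com" + path)
		}
	}
}
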
7 changes: 4 additions & 3 deletions core/sitemap.go
@@ -4,19 +4,20 @@ import (
"fmt"
"github.com/gocolly/colly"
sitemap "github.com/oxffaa/gopher-parse-sitemap"
"net/url"
"sync"
)

func ParseSiteMap(site string, depth int, output *Output, c *colly.Collector, wg *sync.WaitGroup) {
func ParseSiteMap(site *url.URL, output *Output, c *colly.Collector, wg *sync.WaitGroup) {
defer wg.Done()
sitemapUrls := []string{"/sitemap.xml", "/sitemap_news.xml", "/sitemap_index.xml", "/sitemap-index.xml", "/sitemapindex.xml",
"/sitemap-news.xml", "/post-sitemap.xml", "/page-sitemap.xml", "/portfolio-sitemap.xml", "/home_slider-sitemap.xml", "/category-sitemap.xml",
"/author-sitemap.xml"}

for _, path := range sitemapUrls {
// Ignore errors from paths that do not serve a valid sitemap.xml
Logger.Infof("Trying to find %s", site+path)
_ = sitemap.ParseFromSite(site+path, func(entry sitemap.Entry) error {
Logger.Infof("Trying to find %s", site.String()+path)
_ = sitemap.ParseFromSite(site.String()+path, func(entry sitemap.Entry) error {
outputFormat := fmt.Sprintf("[sitemap] - %s", entry.GetLocation())
fmt.Println(outputFormat)
if output != nil {
53 changes: 28 additions & 25 deletions core/utils.go
Expand Up @@ -21,44 +21,42 @@ func GetRawCookie(cookies []*http.Cookie) string {
return strings.Join(rawCookies, "; ")
}

func GetDomain(s string) string {
u, err := url.Parse(s)
if err != nil {
return ""
}
domain, err := publicsuffix.EffectiveTLDPlusOne(u.Hostname())
func GetDomain(site *url.URL) string {
domain, err := publicsuffix.EffectiveTLDPlusOne(site.Hostname())
if err != nil {
return ""
}
return domain
}

func GetHostname(s string) string {
u, err := url.Parse(s)
if err != nil {
return ""
}
return u.Hostname()
}
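
GetDomain now receives an already-parsed *url.URL, and GetHostname is dropped because callers can use site.Hostname() directly. For reference, a minimal standalone sketch of how golang.org/x/net/publicsuffix behaves (the input hostname is an example of mine):

package main

import (
	"fmt"

	"golang.org/x/net/publicsuffix"
)

func main() {
	// EffectiveTLDPlusOne returns the registrable domain: the public suffix
	// plus one label, e.g. "sub.example.co.uk" -> "example.co.uk".
	domain, err := publicsuffix.EffectiveTLDPlusOne("sub.example.co.uk")
	if err != nil {
		fmt.Println("parse error:", err)
		return
	}
	fmt.Println(domain) // example.co.uk
}
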

func FixUrl(url, site string) string {
func FixUrl(url string, site *url.URL) string {
var newUrl string
if strings.HasPrefix(url, "//") {
// //google.com/example.php
newUrl = site.Scheme + ":" + url

if strings.HasPrefix(url, "http") {
} else if strings.HasPrefix(url, "http") {
// http://google.com || https://google.com
newUrl = url
} else if strings.HasPrefix(url, "//") {
// //google.com/example.php
newUrl = "https:" + url
} else if !strings.HasPrefix(url, "http") && len(url) > 0 {
if url[:1] == "/" {

} else if !strings.HasPrefix(url, "//") {
if strings.HasPrefix(url, "/") {
// Ex: /?thread=10
newUrl = site + url
newUrl = site.Scheme + "://" + site.Host + url

} else {
// Ex: ?thread=10
newUrl = site + "/" + url
if strings.HasPrefix(url, ".") {
if strings.HasPrefix(url, "..") {
newUrl = site.Scheme + "://" + site.Host + url[2:]
} else {
newUrl = site.Scheme + "://" + site.Host + url[1:]
}
} else {
newUrl = site.Scheme + "://" + site.Host + url
}
}
}
//Logger.Debugf("[Fix url] old: %s - new: %s", url, newUrl)
return newUrl
}
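
The rewritten FixUrl resolves scheme-relative (`//`), absolute (`http...`), root-relative (`/`), dot-relative (`./`, `../`), and bare references by concatenating them onto the target's scheme and host. The standard library's (*url.URL).ResolveReference performs full RFC 3986 resolution against the base path and is a possible alternative, shown here only for comparison (this is not what the commit uses):

package main

import (
	"fmt"
	"net/url"
)

func main() {
	base, _ := url.Parse("https://example.com/app/")
	refs := []string{"//cdn.example.com/a.js", "/login", "./page.html", "../up.html", "?thread=10"}
	for _, ref := range refs {
		u, err := url.Parse(ref)
		if err != nil {
			continue
		}
		// ResolveReference interprets ref relative to base, handling
		// scheme-relative, root-relative, and dot-segment forms.
		fmt.Println(base.ResolveReference(u).String())
	}
}
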

@@ -136,7 +134,12 @@ func DecodeChars(s string) string {
}

// In case json encoded chars
replacer := strings.NewReplacer(`\u002f`, "/", `\u0026`, "&")
replacer := strings.NewReplacer(
`\u002f`, "/",
`\U002F`, "/",
`\u0026`, "&",
`\U0026`, "&",
)
s = replacer.Replace(s)
return s
}
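
DecodeChars now also handles upper-case variants of the JSON-escaped slash and ampersand. A minimal standalone sketch of how strings.NewReplacer behaves with such pairs (the sample input is mine):

package main

import (
	"fmt"
	"strings"
)

func main() {
	// NewReplacer takes old/new pairs; Replace applies them all in one pass.
	replacer := strings.NewReplacer(
		`\u002f`, "/",
		`\U002F`, "/",
		`\u0026`, "&",
		`\U0026`, "&",
	)
	fmt.Println(replacer.Replace(`path\u002fto\U002Fpage?a=1\u0026b=2`))
	// Output: path/to/page?a=1&b=2
}
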
2 changes: 1 addition & 1 deletion core/utils_test.go
@@ -5,4 +5,4 @@ import "testing"
func TestGetExtType(t *testing.T) {
url := "https://domain.com/data/avatars/m/123/12312312.jpg?1562846649"
t.Log(GetExtType(url))
}
}
21 changes: 8 additions & 13 deletions main.go
@@ -71,10 +71,8 @@ func run(cmd *cobra.Command, args []string) {
isDebug, _ := cmd.Flags().GetBool("debug")
if isDebug {
core.Logger.SetLevel(logrus.DebugLevel)
core.Logger.SetOutput(os.Stdout)
} else {
core.Logger.SetLevel(logrus.InfoLevel)
core.Logger.SetOutput(os.Stdout)
}

verbose, _ := cmd.Flags().GetBool("verbose")
@@ -125,48 +123,46 @@ func run(cmd *cobra.Command, args []string) {
robots, _ := cmd.Flags().GetBool("robots")
otherSource, _ := cmd.Flags().GetBool("other-source")
includeSubs, _ := cmd.Flags().GetBool("include-subs")
maxDepth, _ := cmd.Flags().GetInt("depth")

var wg sync.WaitGroup
inputChan := make(chan string, threads)
for i := 0; i < threads; i++ {
wg.Add(1)
go func() {
defer wg.Done()
for site := range inputChan {
u, err := url.Parse(site)
for rawSite := range inputChan {
site, err := url.Parse(rawSite)
if err != nil {
logrus.Errorf("Failed to parse: %s", site)
logrus.Errorf("Failed to parse %s: %s", rawSite, err)
continue
}

var siteWg sync.WaitGroup

crawler := core.NewCrawler(site, cmd)
site = strings.TrimSuffix(u.String(), "/")

siteWg.Add(1)
go func() {
crawler.Start()
defer siteWg.Done()
}()

// Brute force Sitemap/Robots path
// Brute force Sitemap path
if sitemap {
siteWg.Add(1)
go core.ParseSiteMap(site, maxDepth, crawler.Output, crawler.C, &siteWg)
go core.ParseSiteMap(site, crawler.Output, crawler.C, &siteWg)
}

// Find Robots.txt
if robots {
siteWg.Add(1)
go core.ParseRobots(site, maxDepth, crawler.Output, crawler.C, &siteWg)
go core.ParseRobots(site, crawler.Output, crawler.C, &siteWg)
}

if otherSource {
siteWg.Add(1)
go func() {
defer siteWg.Done()
urls := core.OtherSources(core.GetHostname(site), includeSubs)
urls := core.OtherSources(site.Hostname(), includeSubs)
for _, url := range urls {
url = strings.TrimSpace(url)
if len(url) == 0 {
@@ -183,7 +179,6 @@
}
siteWg.Wait()
crawler.C.Wait()
//_ = crawler.Output.Close()
}
}()
}
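
main.go keeps its bounded worker pool: a buffered channel of raw targets consumed by `threads` goroutines, each parsing the URL before building a crawler and its per-site WaitGroup. A self-contained sketch of the same pattern with placeholder work (the names and the fake work are illustrative, not the project's):

package main

import (
	"fmt"
	"net/url"
	"sync"
)

func main() {
	threads := 3
	inputChan := make(chan string, threads)

	var wg sync.WaitGroup
	for i := 0; i < threads; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for raw := range inputChan {
				site, err := url.Parse(raw)
				if err != nil {
					fmt.Printf("Failed to parse %s: %s\n", raw, err)
					continue
				}
				// Placeholder for crawler construction and Start().
				fmt.Println("crawling", site.Hostname())
			}
		}()
	}

	for _, s := range []string{"https://example.com", "https://example.org"} {
		inputChan <- s
	}
	close(inputChan)
	wg.Wait()
}
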
