-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.go
142 lines (112 loc) · 3.68 KB
/
main.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
package main
import (
"encoding/csv"
"fmt"
"log"
"os"
"strconv"
"strings"
"github.com/gocolly/colly"
)
// main scrapes product listings from the Orami home page into a CSV file.
//
// It creates oramiproducts.csv, writes the header row, and visits the home
// page with a colly collector. Every anchor matched by the category selector
// whose href contains "/c/" is passed to extractProductsPerCategory together
// with the shared CSV writer.
func main() {
	homeURL := "https://www.orami.co.id"
	fName := "oramiproducts.csv"

	file, err := os.Create(fName)
	if err != nil {
		// log.Fatalf terminates the process, so nothing may follow it.
		log.Fatalf("Cannot create file %q: %s\n", fName, err)
	}
	defer file.Close()

	writer := csv.NewWriter(file)
	// Deferred calls run in reverse order, so the flush happens before the
	// file is closed. Buffered write errors only surface through Error().
	defer func() {
		writer.Flush()
		if err := writer.Error(); err != nil {
			log.Printf("Cannot write CSV data to %q: %s\n", fName, err)
		}
	}()

	// Write CSV header
	if err := writer.Write([]string{"Category", "Name", "Price", "Disc-Price"}); err != nil {
		log.Printf("Cannot write CSV header to %q: %s\n", fName, err)
		return
	}

	// Instantiate main collector
	c := colly.NewCollector(
		colly.AllowedDomains("www.orami.co.id"),
		colly.CacheDir("./orami_cache"),
	)

	c.OnHTML(`.container .oss-u-1-8 .mb-24 a`, func(e *colly.HTMLElement) {
		catHref := e.Attr("href")
		log.Println("Page category found container ", catHref)

		// products by category url is identified by /c/
		if !strings.Contains(catHref, "/c/") {
			return
		}

		// Resolve the href against the current page instead of concatenating
		// it to homeURL, so absolute and relative links both yield a valid URL.
		catURL := e.Request.AbsoluteURL(catHref)
		if catURL == "" {
			log.Println("Cannot resolve category url ", catHref)
			return
		}
		log.Println(" url category found ", catURL)
		extractProductsPerCategory(catURL, c, writer)
	})

	c.OnRequest(func(r *colly.Request) {
		log.Println("visiting ", r.URL.String())
	})

	log.Println("going to visit ", homeURL)
	// Log instead of exiting so the deferred flush and close still run.
	if err := c.Visit(homeURL); err != nil {
		log.Printf("Cannot visit %s: %s\n", homeURL, err)
	}
}
// extractProductsPerCategory scrapes every product page of one category.
//
// homeProductsURL is the category URL without a page parameter, c is the
// collector whose configuration is cloned for the product collectors, and
// writer receives one CSV row per product found.
//
// Page 1 is visited first. Its pagination list supplies the number of the
// last page, and pages 2..lastPage are then visited in order. This relies on
// Visit returning only after the page-1 callbacks have run, which holds when
// the collector is not in async mode.
func extractProductsPerCategory(homeProductsURL string, c *colly.Collector, writer *csv.Writer) {
	// Create separate collectors to scrape product information in category pages
	detailCollector := c.Clone()
	detailCollectorPages := c.Clone()

	// for debugging
	detailCollector.OnRequest(func(r *colly.Request) {
		log.Println("visiting product ", r.URL.String())
	})
	detailCollectorPages.OnRequest(func(r *colly.Request) {
		log.Println("visiting product pages ", r.URL.String())
	})

	// Extract details of the products
	detailCollector.OnHTML(`div[class=wrap-widget-detail]`, extractProductsFunc(writer))
	detailCollectorPages.OnHTML(`div[class=wrap-widget-detail]`, extractProductsFunc(writer))

	var lastPage int
	detailCollector.OnHTML(`ul[class=pagination]`, func(e *colly.HTMLElement) {
		log.Println("Paging found", e.Request.URL)

		// The href of the final <li> in the list is taken as the last-page link.
		var lastPageURL string
		e.ForEach("li", func(_ int, el *colly.HTMLElement) {
			lastPageURL = el.ChildAttr("a", "href")
		})
		log.Println("lastPage URL found", lastPageURL)

		page, err := lastPageFromURL(lastPageURL)
		if err != nil {
			// Keep the previous value so only the pages known so far are visited.
			log.Println("Cannot determine last page:", err)
			return
		}
		lastPage = page
		log.Println("lastPage found", lastPage)
	})

	homeProductsURLPOne := homeProductsURL + "?page=1"
	log.Println(homeProductsURLPOne)
	// The URL is passed as-is: using it as a format string would mangle any '%'.
	if err := detailCollector.Visit(homeProductsURLPOne); err != nil {
		log.Printf("Cannot visit %s: %s\n", homeProductsURLPOne, err)
	}

	for i := 2; i <= lastPage; i++ {
		pageURL := fmt.Sprintf("%s?page=%d", homeProductsURL, i)
		if err := detailCollectorPages.Visit(pageURL); err != nil {
			log.Printf("Cannot visit %s: %s\n", pageURL, err)
		}
	}
}

// lastPageFromURL returns the number that follows "?page=" in pageURL.
//
// Anything from the next '&' or '#' onwards is ignored. An error is returned
// when the marker is missing or the value is not an integer.
func lastPageFromURL(pageURL string) (int, error) {
	const marker = "?page="

	idx := strings.Index(pageURL, marker)
	if idx == -1 {
		return 0, fmt.Errorf("no %q parameter in %q", marker, pageURL)
	}

	value := pageURL[idx+len(marker):]
	if end := strings.IndexAny(value, "&#"); end != -1 {
		value = value[:end]
	}

	page, err := strconv.Atoi(value)
	if err != nil {
		return 0, fmt.Errorf("parsing page number in %q: %s", pageURL, err)
	}
	return page, nil
}
// extractProductsFunc returns a colly HTML callback that writes one product
// row (category, name, price, discounted price) to writer.
//
// The callback reads the fields from child elements of the matched element.
// Missing fields are logged and written as empty strings. The "Rp" currency
// prefix and the "Mulai" prefix are stripped from the price values.
func extractProductsFunc(writer *csv.Writer) func(*colly.HTMLElement) {
	return func(e *colly.HTMLElement) {
		log.Println("Product found", e.Request.URL)

		cat := e.ChildText(".prod-cat > label > a")
		if cat == "" {
			log.Println("No category found", e.Request.URL)
		}

		name := e.ChildText(".prod-name > a")
		log.Println("name ", name)
		if name == "" {
			log.Println("No name found", e.Request.URL)
		}

		discPrice := e.ChildText(".widget-price .onsale .disc-price")
		if discPrice != "" {
			// The badge element is nested inside .disc-price, so its text is
			// part of discPrice and has to be removed.
			discount := e.ChildText(".widget-price .onsale .disc-price .wrap-badges-widget")
			discPrice = strings.Replace(discPrice, discount, "", 1)
		}
		discPrice = strings.TrimSpace(strings.Replace(discPrice, "Rp", "", 1))

		// Fall back to the plain price paragraph when there is no on-sale price.
		price := e.ChildText(".widget-price .onsale .normal-price")
		if price == "" {
			price = e.ChildText(".widget-price > p")
		}
		price = strings.Replace(price, "Mulai", "", 1)
		price = strings.TrimSpace(strings.Replace(price, "Rp", "", 1))
		if price == "" {
			// if it happens, then something wrong / web layout has changed
			log.Println("No price found", e.Request.URL)
		}

		if err := writer.Write([]string{cat, name, price, discPrice}); err != nil {
			log.Println("Cannot write product row:", err)
		}
	}
}