This repository has been archived by the owner on Jul 29, 2020. It is now read-only.
-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
lessonscraper.go
158 lines (133 loc) · 4.53 KB
/
lessonscraper.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
package insidescraper
import (
"fmt"
"math/rand"
"strconv"
"strings"
"unicode"
"github.com/PuerkitoBio/goquery"
)
// LessonScraper scrapes a single lesson out of one selection (Row).
// LoadLesson reads the td cells of Row and stores the result in Lesson.
type LessonScraper struct {
// Row is the selection whose td children hold the lesson's title (first cell),
// media links (second cell) and description (third cell).
Row *goquery.Selection
// Lesson is the scraped result; LoadLesson replaces it with a freshly built value.
Lesson *Lesson
}
// LoadLesson scrapes the row and stores the structured result in scraper.Lesson.
//
// The lesson title is the trimmed text of the row's first td cell and the ID
// is a random integer rendered as a string. Media sources and descriptions
// are filled in afterwards from the second and third cells.
func (scraper *LessonScraper) LoadLesson() {
	titleCell := scraper.Row.ChildrenFiltered("td:first-child")

	lesson := &Lesson{
		SiteData: &SiteData{Title: strings.TrimSpace(titleCell.Text())},
		ID:       strconv.Itoa(rand.Int()),
	}
	scraper.Lesson = lesson

	// Descriptions are matched against audio titles, so sources must load first.
	scraper.loadMediaSources()
	scraper.loadMediaDescription()
}
// loadMediaSources walks the direct child nodes of the row's second td cell
// and collects audio items (anchors with an "mp3" attribute) and PDF links
// (anchors with only an "href" attribute) into scraper.Lesson.
//
// Text nodes supply the title of the media item currently being built, and a
// <br> node ends the current item and starts a fresh one.
func (scraper *LessonScraper) loadMediaSources() {
	lesson := scraper.Lesson

	// current is the media item being assembled. Once an audio item has been
	// appended it points at the stored slice element, so a title that appears
	// after the link still lands on the stored item.
	current := &Media{SiteData: &SiteData{}}

	cell := scraper.Row.ChildrenFiltered("td:nth-child(2)")
	cell.Contents().Each(func(_ int, node *goquery.Selection) {
		nodeName := goquery.NodeName(node)

		if nodeName == "br" {
			// A line break closes the current item; begin a new, empty one.
			current = &Media{SiteData: &SiteData{}}
			return
		}
		if nodeName == "#text" {
			current.Title = getSanatizedTitle(node.Text())
			return
		}
		if nodeName != "a" {
			return
		}

		mp3Source, hasMP3 := node.Attr("mp3")
		if hasMP3 {
			current.Source = mp3Source
			lesson.Audio = append(lesson.Audio, *current)
			// Keep editing the stored copy rather than the detached original.
			current = &lesson.Audio[len(lesson.Audio)-1]
			return
		}

		pdfSource, hasHref := node.Attr("href")
		if hasHref {
			lesson.Pdf = append(lesson.Pdf, pdfSource)
			return
		}

		fmt.Println("Error: No source was found.")
	})
}
// loadMediaDescription reads the text of the row's third td cell and
// distributes its lines between the lesson description and the descriptions
// of the audio items already stored on scraper.Lesson.
//
// A line whose sanitized text matches an audio title selects that audio as
// the target for the lines that follow. Lines seen before any audio has been
// selected are appended to the lesson description.
func (scraper *LessonScraper) loadMediaDescription() {
	cell := scraper.Row.ChildrenFiltered("td:nth-child(3)")
	lines := strings.Split(strings.TrimSpace(cell.Text()), "\n")

	// target is the audio item currently receiving description lines.
	var target *Media
	for _, line := range lines {
		matched := getMediaWithTitle(scraper.Lesson.Audio, getSanatizedTitle(line))
		switch {
		case matched != nil:
			// The line is a title: switch targets and do not record the line.
			target = matched
		case target != nil:
			target.Description = separate(target.Description, line)
		default:
			scraper.Lesson.Description = separate(scraper.Lesson.Description, line)
		}
	}

	for i := range scraper.Lesson.Audio {
		media := &scraper.Lesson.Audio[i]
		media.Description = strings.TrimSpace(media.Description)
	}
	scraper.Lesson.Title = strings.TrimSpace(scraper.Lesson.Title)
}
// separate joins data1 and data2 with a newline and trims surrounding
// whitespace from the result. When data1 is empty no newline is inserted and
// only the trimmed data2 is returned.
func separate(data1, data2 string) string {
	joined := data2
	if data1 != "" {
		joined = data1 + "\n" + data2
	}
	return strings.TrimSpace(joined)
}
// getSanatizedTitle strips the decoration around a media title: the first
// occurrence of "PDF", then the first occurrence of "MP3", and finally any
// leading or trailing whitespace, '-', '.' or ':' characters.
func getSanatizedTitle(title string) string {
	// Order matters: "PDF" is removed before "MP3" is searched for.
	for _, label := range []string{"PDF", "MP3"} {
		title = strings.Replace(title, label, "", 1)
	}

	isPadding := func(r rune) bool {
		switch r {
		case '-', '.', ':':
			return true
		}
		return unicode.IsSpace(r)
	}
	return strings.TrimFunc(title, isPadding)
}
// getMediaWithTitle returns a pointer to the element of mediaSlice whose
// title matches possibleTitle, or nil when nothing matches.
//
// Matching happens in two passes:
//
//  1. An exact title match anywhere in the slice wins. This pass runs over
//     the whole slice first so that an earlier element which only matches as
//     part of a composite title cannot shadow (and be renamed to) a later
//     exact match.
//  2. Otherwise a composite match is tried: when exactly one of the two
//     titles splits into two parts (see getSplit) and one of those parts
//     equals the other title, the element matches. In that case the element
//     keeps the longer of the two titles, so mediaSlice may be modified.
//
// The returned pointer aliases mediaSlice's backing array.
func getMediaWithTitle(mediaSlice []Media, possibleTitle string) *Media {
	// Pass 1: exact matches take priority over composite matches.
	for i := range mediaSlice {
		if mediaSlice[i].Title == possibleTitle {
			return &mediaSlice[i]
		}
	}

	// Pass 2: composite titles, e.g. Class One/ Five (המשך).
	// See https://insidechassidus.org/maamarim/maamarim-of-the-rebbe/text-based-concise-summary/1553-maamarim-5715
	splitPossibleTitle := getSplit(possibleTitle) // loop-invariant
	for i := range mediaSlice {
		actualTitle := mediaSlice[i].Title
		splitActualTitle := getSplit(actualTitle)

		// Two composites can only match exactly, which pass 1 already ruled out.
		if len(splitPossibleTitle) > 1 && len(splitActualTitle) > 1 {
			continue
		}

		// Work out which title is the composite and which is the plain one.
		split, normalTitle := splitPossibleTitle, actualTitle
		if len(split) == 1 {
			split, normalTitle = splitActualTitle, possibleTitle
		}

		if len(split) == 2 && (split[0] == normalTitle || split[1] == normalTitle) {
			// Keep the longer title on the stored item.
			if len(possibleTitle) > len(actualTitle) {
				mediaSlice[i].Title = possibleTitle
			}
			return &mediaSlice[i]
		}
	}
	return nil
}
// getSplit breaks a possibly composite title into its parts.
//
// The title is split on "/" first; if that does not give exactly two parts
// it is split on "," instead. When the final result has exactly two parts
// each part is sanitized with getSanatizedTitle; any other result is
// returned as produced by the last split.
func getSplit(title string) []string {
	var parts []string
	for _, sep := range []string{"/", ","} {
		parts = strings.Split(title, sep)
		if len(parts) == 2 {
			break
		}
	}

	if len(parts) != 2 {
		return parts
	}
	for i, part := range parts {
		parts[i] = getSanatizedTitle(part)
	}
	return parts
}