From ce31a19ae3d5b2ce1cdd4f14426685ecaaee6b2c Mon Sep 17 00:00:00 2001 From: jakopako Date: Wed, 25 Dec 2024 12:13:23 +0100 Subject: [PATCH] allow configuring a default for the date components. Fixes #185 --- README.md | 14 ++-- scraper/scraper.go | 70 +++++++++--------- scraper/scraper_test.go | 160 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 200 insertions(+), 44 deletions(-) diff --git a/README.md b/README.md index 5b65726..0c9f572 100644 --- a/README.md +++ b/README.md @@ -230,10 +230,9 @@ A dynamic field can have one of the following three types: `text`, `url` or `dat | components | | | X | `[]` | | date_language | | | X | `"de_DE"` | | date_location | | | X | `"UTC"` | -| default | X | X | | `""` | | guess_year | | | X | `false` | | hide | X | X | X | `false` | -| location | X | X | | `[]` | +| location | X | X | X (date components) | `[]` | | name | X | X | X | `""` | | on_subpage | X | X | X | `""` | | separator | X | | | `""` | @@ -296,10 +295,6 @@ The `date_language` needs to correspond to the language on the website. Note, th `date_location` sets the time zone of the respective date. -**`default`** - -If no value is found on the website the field's value defaults to this `default`. - **`guess_year`** If set to `false` and no date component is defined that covers the year, the year of the resulting date defaults to the current year. If set to `true` and no date component is defined that covers the year, goskyr will try to be 'smart' in guessing the year. This helps if a scraped list of dates covers more than one year and/or scraped dates are not within the current year but the next. Note that there are definitely some cases where this year guessing does not yet work. @@ -341,9 +336,10 @@ fields: regex_extract: exp: "[^•]*" index: 0 + ignore_errors: false # default is false ``` -This field is implicitly of type `text`. The `location` tells the scraper where to look for the field value and how to extract it. 
In this case the selector on its own would not be enough to extract the desired value as we would get something like this: `Bastian Baker • Konzert`. That's why there is an extra option to define a regular expression to extract a substring. Note that in this example our extracted string would still contain a trailing space which is automatically removed by the scraper. Let's have a look at a few more examples to have a better understanding of the location configuration. +This field is implicitly of type `text`. The `location` tells the scraper where to look for the field value and how to extract it. In this case the selector on its own would not be enough to extract the desired value as we would get something like this: `Bastian Baker • Konzert`. That's why there is an extra option to define a regular expression to extract a substring. Note that in this example our extracted string would still contain a trailing space which is automatically removed by the scraper. Moreover, if `ignore_errors` is set to `true`, a failed regex extraction (no match, or an `index` beyond the number of matches) no longer produces an error; the scraper returns an empty string for this location instead. Let's have a look at a few more examples to have a better understanding of the location configuration. _Subkey: `child_index`_ @@ -423,6 +419,10 @@ _Subkey: `json_selector`_ If the string extracted from the webpage is a json string, then you can extract data from that json based on the give `json_selector`. +_Subkey: `default`_ + +If no value is found with the given configuration of this `location`, the value defaults to `default`. + **`name`** The name of the respective field. 
diff --git a/scraper/scraper.go b/scraper/scraper.go index 18e6eeb..f74df6b 100644 --- a/scraper/scraper.go +++ b/scraper/scraper.go @@ -86,6 +86,7 @@ func NewConfig(configPath string) (*Config, error) { type RegexConfig struct { RegexPattern string `yaml:"exp"` Index int `yaml:"index"` + IgnoreErrors bool `yaml:"ignore_errors"` } // ElementLocation is used to find a specific string in a html document @@ -99,6 +100,7 @@ type ElementLocation struct { EntireSubtree bool `yaml:"entire_subtree,omitempty"` AllNodes bool `yaml:"all_nodes,omitempty"` Separator string `yaml:"separator,omitempty"` + Default string `yaml:"default,omitempty"` } // TransformConfig is used to replace an existing substring with some other @@ -126,7 +128,6 @@ type Field struct { Value string `yaml:"value,omitempty"` Type string `yaml:"type,omitempty"` // can currently be text, url or date ElementLocations ElementLocations `yaml:"location,omitempty"` // elements are extracted strings joined using the given Separator - Default string `yaml:"default,omitempty"` // the default for a dynamic field (text or url) if no value is found Separator string `yaml:"separator,omitempty"` // If a field can be found on a subpage the following variable has to contain a field name of // a field of type 'url' that is located on the main page. 
@@ -608,13 +609,8 @@ func extractField(field *Field, event map[string]interface{}, s *goquery.Selecti } } t := strings.Join(parts, field.Separator) - if t == "" { - // if the extracted value is an empty string assign the default value - t = field.Default - if !field.CanBeEmpty && t == "" { - // if it's still empty and must not be empty return an error - return fmt.Errorf("field %s cannot be empty", field.Name) - } + if !field.CanBeEmpty && t == "" { + return fmt.Errorf("field %s cannot be empty", field.Name) } // transform the string if required for _, tr := range field.Transform { @@ -633,13 +629,8 @@ func extractField(field *Field, event map[string]interface{}, s *goquery.Selecti if err != nil { return err } - if url == "" { - // if the extracted value is an empty string assign the default value - url = field.Default - if !field.CanBeEmpty && url == "" { - // if it's still empty and must not be empty return an error - return fmt.Errorf("field %s cannot be empty", field.Name) - } + if !field.CanBeEmpty && url == "" { + return fmt.Errorf("field %s cannot be empty", field.Name) } event[field.Name] = url case "date": @@ -707,9 +698,10 @@ type datePart struct { layoutParts []string } +// not very nice because we only need this +// to be able to test deterministically type dateDefaults struct { - year int - time string // should be format 15:04 + year int // format: 2006 } func getDate(f *Field, s *goquery.Selection, dd dateDefaults) (time.Time, error) { @@ -757,11 +749,12 @@ func getDate(f *Field, s *goquery.Selection, dd dateDefaults) (time.Time, error) } // currently not all date parts have default values - if !combinedParts.Day || !combinedParts.Month { - return t, errors.New("date parsing error: to generate a date at least a day and a month is needed") + if !combinedParts.Day || !combinedParts.Month || !combinedParts.Time { + return t, errors.New("date parsing error: to generate a date at least a time, a day and a month are needed") } - // adding default values 
where necessary + // year is special in the sense that it defaults to the current year + // if nothing is specified if !combinedParts.Year { if dd.year == 0 { dd.year = time.Now().Year() @@ -771,15 +764,6 @@ func getDate(f *Field, s *goquery.Selection, dd dateDefaults) (time.Time, error) layoutParts: []string{"2006"}, }) } - if !combinedParts.Time { - if dd.time == "" { - dd.time = "20:00" - } - dateParts = append(dateParts, datePart{ - stringPart: dd.time, - layoutParts: []string{"15:04"}, - }) - } var dateTimeString string dateTimeLayouts := []string{""} @@ -987,7 +971,12 @@ func getTextString(e *ElementLocation, s *goquery.Selection) (string, error) { for i, f := range fieldStrings { fieldStrings[i] = utils.ShortenString(f, e.MaxLength) } - return strings.Join(fieldStrings, e.Separator), nil + finalString := strings.Join(fieldStrings, e.Separator) + + if finalString == "" && e.Default != "" { + return e.Default, nil + } + return finalString, nil } func extractStringRegex(rc *RegexConfig, s string) (string, error) { @@ -998,18 +987,25 @@ func extractStringRegex(rc *RegexConfig, s string) (string, error) { return "", err } matchingStrings := regex.FindAllString(s, -1) + errMsg := "" if len(matchingStrings) == 0 { - msg := fmt.Sprintf("no matching strings found for regex: %s", rc.RegexPattern) - return "", errors.New(msg) - } - if rc.Index == -1 { + errMsg = fmt.Sprintf("no matching strings found for regex: %s", rc.RegexPattern) + } else if rc.Index == -1 { extractedString = matchingStrings[len(matchingStrings)-1] } else { if rc.Index >= len(matchingStrings) { - msg := fmt.Sprintf("regex index out of bounds. regex '%s' gave only %d matches", rc.RegexPattern, len(matchingStrings)) - return "", errors.New(msg) + errMsg = fmt.Sprintf("regex index out of bounds. 
regex '%s' gave only %d matches", rc.RegexPattern, len(matchingStrings)) + } else { + extractedString = matchingStrings[rc.Index] + } + } + if errMsg != "" { + if rc.IgnoreErrors { + slog.Info(fmt.Sprintf("ignoring regex error: %s", errMsg)) + return "", nil + } else { + return "", errors.New(errMsg) } - extractedString = matchingStrings[rc.Index] } } return extractedString, nil diff --git a/scraper/scraper_test.go b/scraper/scraper_test.go index 5552332..b48ba29 100644 --- a/scraper/scraper_test.go +++ b/scraper/scraper_test.go @@ -91,6 +91,13 @@ const ( 29.02.Heinz Rudolf Kunze & Verstärkung – ABGESAGT ` + htmlString7 = ` +

+ + 20.02.Heinz Rudolf Kunze & Verstärkung + – ABGESAGT +

` ) func TestFilterItemMatchTrue(t *testing.T) { @@ -636,6 +643,17 @@ func TestExtractFieldDate29Feb(t *testing.T) { "02.01.", }, }, + { + Covers: date.CoveredDateParts{ + Time: true, + }, + ElementLocation: ElementLocation{ + Default: "19:30", + }, + Layout: []string{ + "15:04", + }, + }, }, DateLocation: "Europe/Berlin", GuessYear: true, @@ -841,3 +859,145 @@ func TestGuessYearStartBeforeReference(t *testing.T) { } } } + +func TestDefaultTextValue(t *testing.T) { + d := "default text" + doc, err := goquery.NewDocumentFromReader(strings.NewReader(htmlString5)) + if err != nil { + t.Fatalf("unexpected error while reading html string: %v", err) + } + l := &ElementLocation{ + Selector: ".non-existent", + Default: d, + } + v, err := getTextString(l, doc.Selection) + if err != nil { + t.Fatalf("unexpected error while extracting the element: %v", err) + } + if v != d { + t.Fatalf("expected '%s' but got '%s'", d, v) + } +} + +func TestDefaultTextValueExistentValue(t *testing.T) { + d := "default text" + e := "Treffpunkt" + doc, err := goquery.NewDocumentFromReader(strings.NewReader(htmlString4)) + if err != nil { + t.Fatalf("unexpected error while reading html string: %v", err) + } + l := &ElementLocation{ + Selector: "div > a > div", + Default: d, + } + v, err := getTextString(l, doc.Selection) + if err != nil { + t.Fatalf("unexpected error while extracting the element: %v", err) + } + if v != e { + t.Fatalf("expected '%s' but got '%s'", e, v) + } +} + +func TestDefaultValueDateComponentNonExistent(t *testing.T) { + doc, err := goquery.NewDocumentFromReader(strings.NewReader(htmlString7)) + if err != nil { + t.Fatalf("unexpected error while reading html string: %v", err) + } + f := &Field{ + Name: "date", + Type: "date", + Components: []DateComponent{ + { + Covers: date.CoveredDateParts{ + Day: true, + Month: true, + }, + ElementLocation: ElementLocation{ + Selector: "h2 > a > span", + }, + Layout: []string{ + "02.01.", + }, + }, + { + Covers: date.CoveredDateParts{ + 
Time: true, + }, + ElementLocation: ElementLocation{ + Selector: ".non-existent", + Default: "19:30", + }, + Layout: []string{ + "15:04", + }, + }, + }, + DateLocation: "Europe/Berlin", + GuessYear: true, + } + dt, err := getDate(f, doc.Selection, dateDefaults{}) + if err != nil { + t.Fatalf("unexpected error while extracting the date field: %v", err) + } + if dt.Hour() != 19 { + t.Fatalf("expected hour to be %d but got %d", 19, dt.Hour()) + } + if dt.Minute() != 30 { + t.Fatalf("expected minute to be %d but got %d", 30, dt.Minute()) + } +} + +func TestDefaultValueDateComponentRegexExtractError(t *testing.T) { + doc, err := goquery.NewDocumentFromReader(strings.NewReader(htmlString7)) + if err != nil { + t.Fatalf("unexpected error while reading html string: %v", err) + } + f := &Field{ + Name: "date", + Type: "date", + Components: []DateComponent{ + { + Covers: date.CoveredDateParts{ + Day: true, + Month: true, + }, + ElementLocation: ElementLocation{ + Selector: "h2 > a > span", + Default: "1. April", + RegexExtract: RegexConfig{ + RegexPattern: "[A-Z]{20}", // non-matching regex + IgnoreErrors: true, // will make sure the selector returns an empty string in case of an error in which case we default to the given default + }, + }, + Layout: []string{ + "2. January", + }, + }, + { + Covers: date.CoveredDateParts{ + Time: true, + }, + ElementLocation: ElementLocation{ + Selector: ".non-existent", + Default: "19:30", + }, + Layout: []string{ + "15:04", + }, + }, + }, + DateLocation: "Europe/Berlin", + GuessYear: true, + } + dt, err := getDate(f, doc.Selection, dateDefaults{}) + if err != nil { + t.Fatalf("unexpected error while extracting the date field: %v", err) + } + if dt.Day() != 1 { + t.Fatalf("expected day to be %d but got %d", 1, dt.Day()) + } + if dt.Month() != 4 { + t.Fatalf("expected month to be %d but got %d", 4, dt.Month()) + } +}