Skip to content

Commit

Permalink
[scraper]: Add support for deducing resource name from file path
Browse files Browse the repository at this point in the history
if scraper cannot extract it from the markdown file.

Signed-off-by: Alper Rifat Ulucinar <[email protected]>
  • Loading branch information
ulucinar committed Mar 9, 2023
1 parent 5377e5d commit dc98682
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 2 deletions.
2 changes: 2 additions & 0 deletions cmd/scraper/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ func main() {
app = kingpin.New(filepath.Base(os.Args[0]), "Terraform Registry provider metadata scraper.").DefaultEnvars()
outFile = app.Flag("out", "Provider metadata output file path").Short('o').Default("provider-metadata.yaml").OpenFile(os.O_CREATE, 0644)
providerName = app.Flag("name", "Provider name").Short('n').Required().String()
resourcePrefix = app.Flag("resource-prefix", `Terraform resource name prefix for the Terraform provider. For example, this is "google" for the google Terraform provider.`).String()
codeXPath = app.Flag("code-xpath", "Code XPath expression").Default(`//code[@class="language-terraform" or @class="language-hcl"]/text()`).String()
preludeXPath = app.Flag("prelude-xpath", "Prelude XPath expression").Default(`//text()[contains(., "description") and contains(., "page_title")]`).String()
fieldXPath = app.Flag("field-xpath", "Field documentation XPath expression").Default(`//ul/li//code[1]/text()`).String()
Expand All @@ -37,6 +38,7 @@ func main() {
FieldDocXPath: *fieldXPath,
ImportXPath: *importXPath,
FileExtensions: *fileExtensions,
ResourcePrefix: *resourcePrefix,
}), "Failed to scrape Terraform provider metadata")
kingpin.FatalIfError(pm.Store((*outFile).Name()), "Failed to store Terraform provider metadata to file: %s", (*outFile).Name())
}
21 changes: 19 additions & 2 deletions pkg/registry/meta.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,19 @@ func (r *Resource) addExampleManifest(file *hcl.File, body *hclsyntax.Block) err
return nil
}

func (r *Resource) scrapeExamples(doc *html.Node, codeElXPath string, debug bool) error { // nolint: gocyclo
// getResourceNameFromPath deduces a Terraform resource name from the base
// name of the file at path. The name is the part of the base name preceding
// its first "."; when resourcePrefix is non-empty, the result is
// resourcePrefix + "_" + name.
//
// It returns an empty string when no name can be deduced, i.e. when the base
// name contains no "." or when nothing precedes its first ".".
func getResourceNameFromPath(path, resourcePrefix string) string {
	base := filepath.Base(path)
	dot := strings.IndexByte(base, '.')
	// dot < 0: the base name has no "." to split on.
	// dot == 0: the name part is empty, e.g. ".hidden", or the "." that
	// filepath.Base yields for an empty path. Returning just the prefix
	// followed by "_" would be a bogus resource name, so report "not found".
	if dot <= 0 {
		return ""
	}
	name := base[:dot]
	if resourcePrefix == "" {
		return name
	}
	return resourcePrefix + "_" + name
}

func (r *Resource) scrapeExamples(doc *html.Node, codeElXPath string, path string, resourcePrefix string, debug bool) error { // nolint: gocyclo
resourceName := r.Title
nodes := htmlquery.Find(doc, codeElXPath)
for _, n := range nodes {
Expand All @@ -83,6 +95,9 @@ func (r *Resource) scrapeExamples(doc *html.Node, codeElXPath string, debug bool
}
body.Blocks = trimmed
// first try an exact match to find the example
if len(resourceName) == 0 {
resourceName = getResourceNameFromPath(path, resourcePrefix)
}
if err := r.findExampleBlock(f, body.Blocks, &resourceName, true); err != nil {
return err
}
Expand Down Expand Up @@ -441,7 +456,7 @@ func (r *Resource) scrape(path string, config *ScrapeConfiguration) error {
r.scrapeFieldDocs(doc, config.FieldDocXPath)
r.scrapeImportStatements(doc, config.ImportXPath)

return r.scrapeExamples(doc, config.CodeXPath, config.Debug)
return r.scrapeExamples(doc, config.CodeXPath, path, config.ResourcePrefix, config.Debug)
}

// ScrapeConfiguration is a configurator for the scraper
Expand All @@ -460,6 +475,8 @@ type ScrapeConfiguration struct {
ImportXPath string
// FileExtensions extensions of the files to be scraped
FileExtensions []string
// ResourcePrefix Terraform resource name prefix for the Terraform provider
ResourcePrefix string
}

func (sc *ScrapeConfiguration) hasExpectedExtension(fileName string) bool {
Expand Down

0 comments on commit dc98682

Please sign in to comment.