From 56fd8081b46e811e2e58ee553c84669cdce8cdad Mon Sep 17 00:00:00 2001 From: thyagopereira Date: Mon, 26 Apr 2021 18:35:42 -0300 Subject: [PATCH 1/5] =?UTF-8?q?Recuperando=20id=20de=20sess=C3=A3o?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crawler.go | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/crawler.go b/crawler.go index 7d58936..45dbf6a 100644 --- a/crawler.go +++ b/crawler.go @@ -1,6 +1,50 @@ package main +import ( + "fmt" + "io/ioutil" + "net/http" + "os" + "strings" + + "github.com/dadosjusbr/coletores/status" +) + +// Inicializa um mapa com o formato da url para cada tipo de planilha +func initComplements(month, year int) map[string]string { + return map[string]string{ + "remu": fmt.Sprint("https://servicos-portal.mpro.mp.br/plcVis/frameset?__report=..%2FROOT%2Frel%2Fcontracheque%2Fmembros%2FremuneracaoMembrosAtivos.rptdesign&anomes=", year, fmt.Sprintf("%02d", month), "&nome=&cargo=&lotacao="), + } +} + +// Inicializa o id de sessão para uma dada url +func seasonId(url string) string { + + resp, err := http.Get(url) + if err != nil { + status.ExitFromError(status.NewError(status.ConnectionError, fmt.Errorf("Was not possible to get a season id to the url: %s. %q", url, err))) + os.Exit(1) + } + defer resp.Body.Close() + + page, err := ioutil.ReadAll(resp.Body) + htmlCode := string(page) + + id := strings.Split(htmlCode, "Constants.viewingSessionId = \"") + seasonId := id[1][0:19] + + return seasonId +} + func Crawl(month int, year int, outputPath string) []string { var paths []string + complements := initComplements(month, year) + + for key, complement := range complements { + var fileName = "file" + key + ".xls" + seasonId := seasonId(complement) + fmt.Println(fileName, seasonId) + } + return paths } From 489114461e1fa8685b3b70e7406a4b9c653bf9df Mon Sep 17 00:00:00 2001 From: thyagopereira Date: Tue, 27 Apr 2021 15:30:10 -0300 Subject: [PATCH 2/5] Coleta e escrita de dados --- crawler.go | 39 ++++++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/crawler.go b/crawler.go index 45dbf6a..9af6f75 100644 --- a/crawler.go +++ b/crawler.go @@ -2,6 +2,7 @@ package main import ( "fmt" + "io" "io/ioutil" "net/http" "os" @@ -10,7 +11,11 @@ import ( "github.com/dadosjusbr/coletores/status" ) -// Inicializa um mapa com o formato da url para cada tipo de planilha +var urlFormats = map[string]string{ + "remu": "&__sessionId=%s&__format=xls&__asattachment=true&__overwrite=false", +} + +// Inicializa um mapa com o formato da url complementar para cada tipo de planilha func initComplements(month, year int) map[string]string { return map[string]string{ "remu": fmt.Sprint("https://servicos-portal.mpro.mp.br/plcVis/frameset?__report=..%2FROOT%2Frel%2Fcontracheque%2Fmembros%2FremuneracaoMembrosAtivos.rptdesign&anomes=", year, fmt.Sprintf("%02d", month), "&nome=&cargo=&lotacao="), @@ -36,14 +41,38 @@ func seasonId(url string) string { return seasonId } +func download(url string, filePath string) { + + resp, err := http.Get(url) + if err != nil { + status.ExitFromError(status.NewError(status.DataUnavailable, fmt.Errorf("Não foi possível fazer o download do arquivo: %s .O seguinte erro foi gerado: %q", filePath, err))) + os.Exit(1) + } + + file, err := os.Create(filePath) + if err != nil { + status.ExitFromError(status.NewError(status.DataUnavailable, fmt.Errorf("Não foi possível fazer o download do arquivo: %s .O seguinte erro foi gerado: %q", filePath, err))) + os.Exit(1) + } + defer file.Close() + + io.Copy(file, resp.Body) + defer resp.Body.Close() +} + func Crawl(month int, year int, outputPath string) []string { var paths []string complements := initComplements(month, year) - for key, complement := range complements { - var fileName = "file" + key + ".xls" - seasonId := seasonId(complement) - fmt.Println(fileName, seasonId) + for key, _ := range complements { + var fileName = fmt.Sprint(year, "_", fmt.Sprintf("%02d", month), "_", key) + var filePath = fmt.Sprint(outputPath, "/", fileName, ".xls") + + seasonId := seasonId(complements[key]) + url := fmt.Sprint(complements[key], fmt.Sprintf(urlFormats[key], seasonId)) + + download(url, filePath) + paths = append(paths, filePath) } return paths From a9a2dfa196e367117aec44cfa4d5eb2577dc7d21 Mon Sep 17 00:00:00 2001 From: thyagopereira Date: Wed, 28 Apr 2021 17:30:47 -0300 Subject: [PATCH 3/5] Refatoramento e proposta de design --- crawler.go | 65 +++++++++++++++++++++++++++++------------------------- main.go | 6 ++++- 2 files changed, 40 insertions(+), 31 deletions(-) diff --git a/crawler.go b/crawler.go index 9af6f75..0b06dce 100644 --- a/crawler.go +++ b/crawler.go @@ -11,69 +11,74 @@ import ( "github.com/dadosjusbr/coletores/status" ) -var urlFormats = map[string]string{ - "remu": "&__sessionId=%s&__format=xls&__asattachment=true&__overwrite=false", +type urlRequest struct { + remuDownloadURL string } -// Inicializa um mapa com o formato da url complementar para cada tipo de planilha -func initComplements(month, year int) map[string]string { - return map[string]string{ - "remu": fmt.Sprint("https://servicos-portal.mpro.mp.br/plcVis/frameset?__report=..%2FROOT%2Frel%2Fcontracheque%2Fmembros%2FremuneracaoMembrosAtivos.rptdesign&anomes=", year, fmt.Sprintf("%02d", month), "&nome=&cargo=&lotacao="), +// Retorna as url para download de cada planilha em questão +func initRequests(month, year int) (urlRequest, error) { + + idURL := fmt.Sprint("https://servicos-portal.mpro.mp.br/plcVis/frameset?__report=..%2FROOT%2Frel%2Fcontracheque%2Fmembros%2FremuneracaoMembrosAtivos.rptdesign&anomes=", year, fmt.Sprintf("%02d", month), "&nome=&cargo=&lotacao=") + sessionId, err := seasonId(idURL) + if err != nil { + return urlRequest{}, err } + + downloadURL := fmt.Sprint(idURL, fmt.Sprintf("&__sessionId=%s&__format=xls&__asattachment=true&__overwrite=false", sessionId)) + return urlRequest{downloadURL}, nil + } // Inicializa o id de sessão para uma dada url -func seasonId(url string) string { - +func seasonId(url string) (string, error) { resp, err := http.Get(url) if err != nil { - status.ExitFromError(status.NewError(status.ConnectionError, fmt.Errorf("Was not possible to get a season id to the url: %s. %q", url, err))) - os.Exit(1) + return "", status.NewError(status.ConnectionError, fmt.Errorf("Was not possible to get a season id to the url: %s. %q", url, err)) } defer resp.Body.Close() page, err := ioutil.ReadAll(resp.Body) - htmlCode := string(page) + if err != nil { + return "", status.NewError(status.ConnectionError, fmt.Errorf("Was not possible to get a season id to the url: %s. %q", url, err)) + } + htmlCode := string(page) id := strings.Split(htmlCode, "Constants.viewingSessionId = \"") seasonId := id[1][0:19] - return seasonId + return seasonId, err } -func download(url string, filePath string) { - +func download(url string, filePath string) error { resp, err := http.Get(url) if err != nil { - status.ExitFromError(status.NewError(status.DataUnavailable, fmt.Errorf("Não foi possível fazer o download do arquivo: %s .O seguinte erro foi gerado: %q", filePath, err))) - os.Exit(1) + return status.NewError(status.DataUnavailable, fmt.Errorf("Was not possible download the file: %s .The following mistake was taken: %q", filePath, err)) } + defer resp.Body.Close() file, err := os.Create(filePath) if err != nil { - status.ExitFromError(status.NewError(status.DataUnavailable, fmt.Errorf("Não foi possível fazer o download do arquivo: %s .O seguinte erro foi gerado: %q", filePath, err))) - os.Exit(1) + return status.NewError(status.DataUnavailable, fmt.Errorf("Was not possible download the file: %s .The following mistake was taken: %q", filePath, err)) } defer file.Close() io.Copy(file, resp.Body) - defer resp.Body.Close() + return nil } -func Crawl(month int, year int, outputPath string) []string { +func Crawl(month int, year int, outputPath string) ([]string, error) { var paths []string - complements := initComplements(month, year) - for key, _ := range complements { - var fileName = fmt.Sprint(year, "_", fmt.Sprintf("%02d", month), "_", key) - var filePath = fmt.Sprint(outputPath, "/", fileName, ".xls") + var fileName = fmt.Sprint(year, "_", fmt.Sprintf("%02d", month), "_remu") + var filePath = fmt.Sprint(outputPath, "/", fileName, ".xls") - seasonId := seasonId(complements[key]) - url := fmt.Sprint(complements[key], fmt.Sprintf(urlFormats[key], seasonId)) - - download(url, filePath) - paths = append(paths, filePath) + request, err := initRequests(year, month) + if err != nil { + return paths, err } - return paths + download(request.remuDownloadURL, filePath) + paths = append(paths, filePath) + + return paths, nil } diff --git a/main.go b/main.go index d5e8087..56a4d70 100644 --- a/main.go +++ b/main.go @@ -46,7 +46,11 @@ func main() { } // Main execution - fileNames := Crawl(month, year, outputPath) + fileNames, err := Crawl(month, year, outputPath) + if err != nil { + status.ExitFromError(err) + os.Exit(1) + } employees := Parse(month, year, fileNames) cr := coletores.ExecutionResult{ From 1bcbee8e66aa163e13f6766f774f67f9d9e6c587 Mon Sep 17 00:00:00 2001 From: thyagopereira Date: Thu, 29 Apr 2021 16:59:11 -0300 Subject: [PATCH 4/5] =?UTF-8?q?Download=20de=20verbas=20indenizat=C3=B3ria?= =?UTF-8?q?s=20adicionado?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crawler.go | 73 ++++++++++++++++++++++++++++++++++++++---------------- main.go | 2 -- 2 files changed, 52 insertions(+), 23 deletions(-) diff --git a/crawler.go b/crawler.go index 0b06dce..d913b15 100644 --- a/crawler.go +++ b/crawler.go @@ -11,26 +11,37 @@ import ( "github.com/dadosjusbr/coletores/status" ) -type urlRequest struct { +const ( + viURLType int = 0 + remuURLType int = 1 +) + +type urlRequests struct { remuDownloadURL string + viDownloadURL string } // Retorna as url para download de cada planilha em questão -func initRequests(month, year int) (urlRequest, error) { - - idURL := fmt.Sprint("https://servicos-portal.mpro.mp.br/plcVis/frameset?__report=..%2FROOT%2Frel%2Fcontracheque%2Fmembros%2FremuneracaoMembrosAtivos.rptdesign&anomes=", year, fmt.Sprintf("%02d", month), "&nome=&cargo=&lotacao=") - sessionId, err := seasonId(idURL) +func requestURL(year, month int) (urlRequests, error) { + remuIDURL := fmt.Sprint("https://servicos-portal.mpro.mp.br/plcVis/frameset?__report=..%2FROOT%2Frel%2Fcontracheque%2Fmembros%2FremuneracaoMembrosAtivos.rptdesign&anomes=", year, fmt.Sprintf("%02d", month), "&nome=&cargo=&lotacao=") + remuSessionID, err := seasonID(remuIDURL) if err != nil { - return urlRequest{}, err + return urlRequests{}, err } + remuDownloadURL := fmt.Sprint(remuIDURL, fmt.Sprintf("&__sessionId=%s&__format=xls&__asattachment=true&__overwrite=false", remuSessionID)) - downloadURL := fmt.Sprint(idURL, fmt.Sprintf("&__sessionId=%s&__format=xls&__asattachment=true&__overwrite=false", sessionId)) - return urlRequest{downloadURL}, nil + viIDURL := fmt.Sprint("https://servicos-portal.mpro.mp.br/plcVis/frameset?__report=..%2FROOT%2Frel%2Fcontracheque%2Fmembros%2FverbasIndenizatoriasMembrosAtivos.rptdesign&anomes=", year, fmt.Sprintf("%02d", month)) + viSessionID, err := seasonID(viIDURL) + if err != nil { + return urlRequests{}, err + } + viDownloadURL := fmt.Sprint(viIDURL, fmt.Sprint("&__sessionId=%s&__format=xls&__asattachment=true&__overwrite=false", viSessionID)) + return urlRequests{remuDownloadURL, viDownloadURL}, nil } // Inicializa o id de sessão para uma dada url -func seasonId(url string) (string, error) { +func seasonID(url string) (string, error) { resp, err := http.Get(url) if err != nil { return "", status.NewError(status.ConnectionError, fmt.Errorf("Was not possible to get a season id to the url: %s. %q", url, err)) @@ -42,8 +53,7 @@ func seasonId(url string) (string, error) { return "", status.NewError(status.ConnectionError, fmt.Errorf("Was not possible to get a season id to the url: %s. %q", url, err)) } - htmlCode := string(page) - id := strings.Split(htmlCode, "Constants.viewingSessionId = \"") + id := strings.Split(string(page), "Constants.viewingSessionId = \"") seasonId := id[1][0:19] return seasonId, err @@ -52,33 +62,54 @@ func seasonId(url string) (string, error) { func download(url string, filePath string) error { resp, err := http.Get(url) if err != nil { - return status.NewError(status.DataUnavailable, fmt.Errorf("Was not possible download the file: %s .The following mistake was taken: %q", filePath, err)) + return status.NewError(status.ConnectionError, fmt.Errorf("Problem doing GET on the URL(%s) to download the file(%s). Error: %q", url, filePath, err)) } defer resp.Body.Close() file, err := os.Create(filePath) if err != nil { - return status.NewError(status.DataUnavailable, fmt.Errorf("Was not possible download the file: %s .The following mistake was taken: %q", filePath, err)) + return status.NewError(status.DataUnavailable, fmt.Errorf("Error creating downloaded (%s) file(%s). Error: %q", url, filePath, err)) } defer file.Close() - io.Copy(file, resp.Body) + _, erro := io.Copy(file, resp.Body) + if erro != nil { + return status.NewError(status.SystemError, fmt.Errorf("Was not possible to save the downloaded file: %s. The following mistake was teken: %q", filePath, erro)) + } return nil } func Crawl(month int, year int, outputPath string) ([]string, error) { var paths []string - var fileName = fmt.Sprint(year, "_", fmt.Sprintf("%02d", month), "_remu") - var filePath = fmt.Sprint(outputPath, "/", fileName, ".xls") - - request, err := initRequests(year, month) + request, err := requestURL(year, month) if err != nil { return paths, err } - download(request.remuDownloadURL, filePath) - paths = append(paths, filePath) - + for typ := 0; typ < 2; typ++ { + switch typ { + case remuURLType: + var fileName = fmt.Sprint("%d", "_", "%02d", "_remu", year, month) + var filePath = fmt.Sprint(fileName, ".xls") + + err = download(request.remuDownloadURL, filePath) + if err != nil { + return paths, err + } + + paths = append(paths, filePath) + case viURLType: + var fileName = fmt.Sprintf("%d", "_", "%02d", "_vi", year, month) + var filePath = fmt.Sprintf(fileName, ".xls") + + err = download(request.viDownloadURL, filePath) + if err != nil { + return paths, err + } + + paths = append(paths, filePath) + } + } return paths, nil } diff --git a/main.go b/main.go index 56a4d70..939de95 100644 --- a/main.go +++ b/main.go @@ -48,7 +48,6 @@ func main() { // Main execution fileNames, err := Crawl(month, year, outputPath) if err != nil { - status.ExitFromError(err) os.Exit(1) } employees := Parse(month, year, fileNames) @@ -71,7 +70,6 @@ func main() { result, err := json.MarshalIndent(cr, "", " ") if err != nil { status.ExitFromError(status.NewError(status.SystemError, fmt.Errorf("JSON marshiling error: %q", err))) - os.Exit(1) } fmt.Println(string(result)) } From 083f0b4c82338e0d5412b19365875867dc1f3a22 Mon Sep 17 00:00:00 2001 From: thyagopereira Date: Mon, 3 May 2021 16:56:46 -0300 Subject: [PATCH 5/5] =?UTF-8?q?Refatoramento,=20adicionando=20download=20n?= =?UTF-8?q?o=20diret=C3=B3rio=20correto?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crawler.go | 57 ++++++++++++++++++++++++++++++------------------------ main.go | 2 +- 2 files changed, 33 insertions(+), 26 deletions(-) diff --git a/crawler.go b/crawler.go index d913b15..058d7bd 100644 --- a/crawler.go +++ b/crawler.go @@ -16,32 +16,32 @@ const ( remuURLType int = 1 ) -type urlRequests struct { - remuDownloadURL string - viDownloadURL string +type requestURLs struct { + remunerationURL string + benefitsURL string } // Retorna as url para download de cada planilha em questão -func requestURL(year, month int) (urlRequests, error) { +func getRequestURLs(year, month int) (requestURLs, error) { remuIDURL := fmt.Sprint("https://servicos-portal.mpro.mp.br/plcVis/frameset?__report=..%2FROOT%2Frel%2Fcontracheque%2Fmembros%2FremuneracaoMembrosAtivos.rptdesign&anomes=", year, fmt.Sprintf("%02d", month), "&nome=&cargo=&lotacao=") - remuSessionID, err := seasonID(remuIDURL) + remuSessionID, err := getSessionID(remuIDURL) if err != nil { - return urlRequests{}, err + return requestURLs{}, err } - remuDownloadURL := fmt.Sprint(remuIDURL, fmt.Sprintf("&__sessionId=%s&__format=xls&__asattachment=true&__overwrite=false", remuSessionID)) + remuDownloadURL := fmt.Sprintf("%s&__sessionId=%s&__format=xls&__asattachment=true&__overwrite=false", remuIDURL, remuSessionID) viIDURL := fmt.Sprint("https://servicos-portal.mpro.mp.br/plcVis/frameset?__report=..%2FROOT%2Frel%2Fcontracheque%2Fmembros%2FverbasIndenizatoriasMembrosAtivos.rptdesign&anomes=", year, fmt.Sprintf("%02d", month)) - viSessionID, err := seasonID(viIDURL) + benefitsSessionID, err := getSessionID(viIDURL) if err != nil { - return urlRequests{}, err + return requestURLs{}, err } - viDownloadURL := fmt.Sprint(viIDURL, fmt.Sprint("&__sessionId=%s&__format=xls&__asattachment=true&__overwrite=false", viSessionID)) + benefitsURL := fmt.Sprintf("%s&__sessionId=%s&__format=xls&__asattachment=true&__overwrite=false", viIDURL, benefitsSessionID) - return urlRequests{remuDownloadURL, viDownloadURL}, nil + return requestURLs{remuDownloadURL, benefitsURL}, nil } // Inicializa o id de sessão para uma dada url -func seasonID(url string) (string, error) { +func getSessionID(url string) (string, error) { resp, err := http.Get(url) if err != nil { return "", status.NewError(status.ConnectionError, fmt.Errorf("Was not possible to get a season id to the url: %s. %q", url, err)) @@ -54,35 +54,42 @@ func seasonID(url string) (string, error) { } id := strings.Split(string(page), "Constants.viewingSessionId = \"") - seasonId := id[1][0:19] - return seasonId, err + return id[1][0:19], err } -func download(url string, filePath string) error { +func download(url string, filePath string, outputPath string) error { resp, err := http.Get(url) if err != nil { return status.NewError(status.ConnectionError, fmt.Errorf("Problem doing GET on the URL(%s) to download the file(%s). Error: %q", url, filePath, err)) } defer resp.Body.Close() + _, err = os.Stat(outputPath) + if os.IsNotExist(err) { + err = os.Mkdir(outputPath, 0755) + if err != nil { + return status.NewError(status.SystemError, fmt.Errorf("Error creating outputfolder (%s). Error: %q", outputPath, err)) + } + } + file, err := os.Create(filePath) if err != nil { return status.NewError(status.DataUnavailable, fmt.Errorf("Error creating downloaded (%s) file(%s). Error: %q", url, filePath, err)) } defer file.Close() - _, erro := io.Copy(file, resp.Body) - if erro != nil { - return status.NewError(status.SystemError, fmt.Errorf("Was not possible to save the downloaded file: %s. The following mistake was teken: %q", filePath, erro)) + if _, err := io.Copy(file, resp.Body); err != nil { + return status.NewError(status.SystemError, fmt.Errorf("Was not possible to save the downloaded file: %s. The following mistake was teken: %q", filePath, err)) } + return nil } func Crawl(month int, year int, outputPath string) ([]string, error) { var paths []string - request, err := requestURL(year, month) + request, err := getRequestURLs(year, month) if err != nil { return paths, err } @@ -90,20 +97,20 @@ func Crawl(month int, year int, outputPath string) ([]string, error) { for typ := 0; typ < 2; typ++ { switch typ { case remuURLType: - var fileName = fmt.Sprint("%d", "_", "%02d", "_remu", year, month) - var filePath = fmt.Sprint(fileName, ".xls") + var fileName = fmt.Sprintf("%d_%02d_remu.xls", year, month) + var filePath = fmt.Sprint(outputPath, "/", fileName) - err = download(request.remuDownloadURL, filePath) + err = download(request.remunerationURL, filePath, outputPath) if err != nil { return paths, err } paths = append(paths, filePath) case viURLType: - var fileName = fmt.Sprintf("%d", "_", "%02d", "_vi", year, month) - var filePath = fmt.Sprintf(fileName, ".xls") + var fileName = fmt.Sprintf("%d_%02d_vi.xls", year, month) + var filePath = fmt.Sprint(outputPath, "/", fileName) - err = download(request.viDownloadURL, filePath) + err = download(request.benefitsURL, filePath, outputPath) if err != nil { return paths, err } diff --git a/main.go b/main.go index 939de95..5c4f70a 100644 --- a/main.go +++ b/main.go @@ -14,7 +14,7 @@ import ( type Environment struct { Month int `envconfig:"MONTH" required:"true"` Year int `envconfig:"YEAR" required:"true"` - OutputFolder string `envconfig:"OUTPUT_FOLDER" default:"/output"` + OutputFolder string `envconfig:"OUTPUT_FOLDER" default:"./output"` GitCommit string `envconfig:"GIT_COMMIT" required:"true"` }