From f9f6a0fbef12191644ebfac5a04583ab9e61de13 Mon Sep 17 00:00:00 2001 From: Mathias Lafeldt Date: Tue, 17 Aug 2021 13:59:41 +0200 Subject: [PATCH] Extract scraping --- get-strip/lambda.rs | 98 ++++++++++++++++++++++++++++++--------------- 1 file changed, 65 insertions(+), 33 deletions(-) diff --git a/get-strip/lambda.rs b/get-strip/lambda.rs index 1c08516..c0fe304 100644 --- a/get-strip/lambda.rs +++ b/get-strip/lambda.rs @@ -4,7 +4,7 @@ use log::{debug, info}; use select::document::Document; use select::predicate::Class; use serde::{Deserialize, Serialize}; -// use serde_json::Value; +use serde_json::json; // use std::env; #[derive(Deserialize, Debug)] @@ -14,10 +14,8 @@ struct Input { #[derive(Serialize, PartialEq, Debug)] struct Output { - date: String, - title: String, - image_url: String, - strip_url: String, + #[serde(flatten)] + comic: Comic, upload_url: String, } @@ -27,15 +25,17 @@ async fn main() -> Result<(), Error> { simple_logger::init_with_env()?; // lambda_runtime::run(handler_fn(handler)).await?; info!( - "{:?}", - handler( - Input { - date: Some("2000-07-15".to_string()), - // date: None, - }, - Context::default(), + "{}", + json!( + handler( + Input { + date: Some("2000-07-15".to_string()), + // date: None, + }, + Context::default(), + ) + .await? ) - .await? ); Ok(()) } @@ -43,28 +43,60 @@ async fn main() -> Result<(), Error> { async fn handler(input: Input, _: Context) -> Result { debug!("Got input: {:?}", input); - let date = input.date.unwrap_or_else(|| { - let now = chrono::Utc::now(); - format!("{}-{:02}-{:02}", now.year(), now.month(), now.day()) - }); - - let base_url = "https://dilbert.com"; - let strip_url = format!("{}/strip/{}", base_url, date); - - let resp = reqwest::get(&strip_url).await?.error_for_status()?; - let body = resp.text().await?; - let document = Document::from(body.as_ref()); - let container = document.find(Class("comic-item-container")).next().unwrap(); - - info!("{}", container.attr("data-id").unwrap()); - let title = container.attr("data-title").unwrap(); - let image_url = container.attr("data-image").unwrap(); + let comic = Dilbert::default().scrape_comic(input.date).await?; Ok(Output { - date: date, - title: title.to_string(), - image_url: image_url.to_string(), - strip_url: strip_url, + comic, upload_url: "".to_string(), }) } + +#[derive(Serialize, Deserialize, PartialEq, Debug)] +struct Comic { + date: String, + title: String, + image_url: String, + strip_url: String, +} + +struct Dilbert { + base_url: String, +} + +impl Default for Dilbert { + fn default() -> Self { + Dilbert { + base_url: "https://dilbert.com".to_string(), + } + } +} + +impl Dilbert { + pub fn new(base_url: String) -> Self { + Self { base_url } + } + + pub async fn scrape_comic(self, date: Option) -> Result { + let date = date.unwrap_or_else(|| { + let now = chrono::Utc::now(); + format!("{}-{:02}-{:02}", now.year(), now.month(), now.day()) + }); + + let strip_url = format!("{}/strip/{}", self.base_url, date); + let resp = reqwest::get(&strip_url).await?.error_for_status()?; + let body = resp.text().await?; + let document = Document::from(body.as_ref()); + let container = document.find(Class("comic-item-container")).next().unwrap(); + + // info!("{}", container.attr("data-id").unwrap()); + let title = container.attr("data-title").unwrap(); + let image_url = container.attr("data-image").unwrap(); + + Ok(Comic { + date, + title: title.to_string(), + image_url: image_url.to_string(), + strip_url, + }) + } +}