Skip to content
This repository has been archived by the owner on Jun 3, 2023. It is now read-only.

Commit

Permalink
Extract scraping
Browse files Browse the repository at this point in the history
  • Loading branch information
mlafeldt committed Aug 17, 2021
1 parent 93e567d commit f9f6a0f
Showing 1 changed file with 65 additions and 33 deletions.
98 changes: 65 additions & 33 deletions get-strip/lambda.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use log::{debug, info};
use select::document::Document;
use select::predicate::Class;
use serde::{Deserialize, Serialize};
// use serde_json::Value;
use serde_json::json;
// use std::env;

#[derive(Deserialize, Debug)]
Expand All @@ -14,10 +14,8 @@ struct Input {

/// Value returned by `handler` on success.
///
/// NOTE(review): this listing comes from a rendered diff, so the four scalar
/// fields below and the flattened `comic` field presumably belong to the
/// before/after versions of the struct rather than coexisting — confirm
/// against the actual file.
#[derive(Serialize, PartialEq, Debug)]
struct Output {
date: String,
title: String,
image_url: String,
strip_url: String,
// serde's `flatten` attribute serializes the fields of `Comic` inline,
// as if they were declared directly on `Output`.
#[serde(flatten)]
comic: Comic,

// `handler` currently fills this with an empty string.
upload_url: String,
}
Expand All @@ -27,44 +25,78 @@ async fn main() -> Result<(), Error> {
simple_logger::init_with_env()?;
// lambda_runtime::run(handler_fn(handler)).await?;
info!(
"{:?}",
handler(
Input {
date: Some("2000-07-15".to_string()),
// date: None,
},
Context::default(),
"{}",
json!(
handler(
Input {
date: Some("2000-07-15".to_string()),
// date: None,
},
Context::default(),
)
.await?
)
.await?
);
Ok(())
}

/// Handles one invocation: logs the input at debug level, obtains the comic
/// metadata for the requested date, and wraps it in an `Output`.
///
/// The `Context` argument is ignored.
///
/// NOTE(review): this listing comes from a rendered diff, so the inline
/// scraping statements and the `Dilbert::default().scrape_comic(..)` call
/// presumably belong to the before/after versions of the body rather than
/// coexisting — confirm against the actual file.
async fn handler(input: Input, _: Context) -> Result<Output, Error> {
debug!("Got input: {:?}", input);

// Fall back to today's UTC date, formatted as YYYY-MM-DD, when none is given.
let date = input.date.unwrap_or_else(|| {
let now = chrono::Utc::now();
format!("{}-{:02}-{:02}", now.year(), now.month(), now.day())
});

let base_url = "https://dilbert.com";
let strip_url = format!("{}/strip/{}", base_url, date);

// Non-success HTTP statuses are turned into errors by `error_for_status`.
let resp = reqwest::get(&strip_url).await?.error_for_status()?;
let body = resp.text().await?;
let document = Document::from(body.as_ref());
// Panics if the page has no element with class `comic-item-container`.
let container = document.find(Class("comic-item-container")).next().unwrap();

// Each `unwrap` below panics if the corresponding attribute is absent.
info!("{}", container.attr("data-id").unwrap());
let title = container.attr("data-title").unwrap();
let image_url = container.attr("data-image").unwrap();
let comic = Dilbert::default().scrape_comic(input.date).await?;

Ok(Output {
date: date,
title: title.to_string(),
image_url: image_url.to_string(),
strip_url: strip_url,
comic,
// No upload URL is produced here; the field is left empty.
upload_url: "".to_string(),
})
}

/// Metadata for one comic strip, as produced by `Dilbert::scrape_comic`.
#[derive(Debug, PartialEq, Serialize, Deserialize)]
struct Comic {
    /// Date string used to build the strip URL: either the caller-supplied
    /// value or today's UTC date formatted as `YYYY-MM-DD`.
    date: String,
    /// Value of the `data-title` attribute on the comic container element.
    title: String,
    /// Value of the `data-image` attribute on the comic container element.
    image_url: String,
    /// Page the strip was scraped from: `{base_url}/strip/{date}`.
    strip_url: String,
}

/// Scraper for comic strips served under a configurable base URL.
///
/// Derives `Debug` (like the other types in this file, so it can be logged
/// with `{:?}`) and `Clone` (so a configured scraper can be reused even though
/// `scrape_comic` consumes `self`).
#[derive(Debug, Clone)]
struct Dilbert {
    /// Scheme and host that strip URLs are built from, e.g. `https://dilbert.com`.
    base_url: String,
}

impl Default for Dilbert {
fn default() -> Self {
Dilbert {
base_url: "https://dilbert.com".to_string(),
}
}
}

impl Dilbert {
    /// Creates a scraper that builds strip URLs from `base_url` instead of
    /// the default site.
    pub fn new(base_url: String) -> Self {
        Self { base_url }
    }

    /// Fetches the strip page for `date` and extracts the comic metadata.
    ///
    /// When `date` is `None`, today's UTC date formatted as `YYYY-MM-DD` is
    /// used. The page is requested from `{base_url}/strip/{date}`.
    ///
    /// # Errors
    ///
    /// Returns an error if the HTTP request fails, the server answers with a
    /// non-success status, the body cannot be read, or the page lacks the
    /// `comic-item-container` element or its `data-title` / `data-image`
    /// attributes. Missing markup is reported as an error rather than a
    /// panic because the page content is outside this program's control.
    pub async fn scrape_comic(self, date: Option<String>) -> Result<Comic, Error> {
        let date = date.unwrap_or_else(|| {
            let now = chrono::Utc::now();
            format!("{}-{:02}-{:02}", now.year(), now.month(), now.day())
        });

        let strip_url = format!("{}/strip/{}", self.base_url, date);
        let body = reqwest::get(&strip_url)
            .await?
            .error_for_status()?
            .text()
            .await?;
        let document = Document::from(body.as_str());

        // Builds the error for a piece of markup that was expected but absent.
        // `std::io::Error` is used because it converts via `?` into the
        // function's error type without requiring a new dependency.
        let missing = |what: &str| {
            std::io::Error::new(
                std::io::ErrorKind::InvalidData,
                format!("{} not found in {}", what, strip_url),
            )
        };

        let container = document
            .find(Class("comic-item-container"))
            .next()
            .ok_or_else(|| missing("comic-item-container element"))?;
        let title = container
            .attr("data-title")
            .ok_or_else(|| missing("data-title attribute"))?
            .to_string();
        let image_url = container
            .attr("data-image")
            .ok_or_else(|| missing("data-image attribute"))?
            .to_string();

        Ok(Comic {
            date,
            title,
            image_url,
            strip_url,
        })
    }
}

0 comments on commit f9f6a0f

Please sign in to comment.