Skip to content

Commit

Permalink
Enable specifying explicit list of external posts to display (alshedi…
Browse files Browse the repository at this point in the history
…vat#2059)

- updates the `external-posts.rb` plugin, allowing the user to specify an
explicit list of URLs in `_config.yml` that are then displayed in the
blog feed as external posts
- 99% of the code in this change is written by gpt-4:
https://chat.openai.com/share/24432d24-36a7-4d6f-a5c0-d7e5142f68cd
  • Loading branch information
alshedivat authored May 28, 2024
1 parent aef27ba commit 8511e5d
Show file tree
Hide file tree
Showing 2 changed files with 83 additions and 19 deletions.
6 changes: 5 additions & 1 deletion _config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ bing_site_verification: # out your bing-site-verification ID (Bing Webmaster)
blog_name: al-folio # blog_name will be displayed in your blog page
blog_description: a simple whitespace theme for academics
permalink: /blog/:year/:title/
lsi: true # produce an index for related posts
lsi: false # produce an index for related posts

# Pagination
pagination:
Expand Down Expand Up @@ -168,6 +168,10 @@ disqus_shortname: al-folio # put your disqus shortname
external_sources:
- name: medium.com
rss_url: https://medium.com/@al-folio/feed
- name: Google Blog
posts:
- url: https://blog.google/technology/ai/google-gemini-update-flash-ai-assistant-io-2024/
published_date: 2024-05-14

# -----------------------------------------------------------------------------
# Collections
Expand Down
96 changes: 78 additions & 18 deletions _plugins/external-posts.rb
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
require 'feedjira'
require 'httparty'
require 'jekyll'
require 'nokogiri'
require 'time'

module ExternalPosts
class ExternalPostsGenerator < Jekyll::Generator
Expand All @@ -10,27 +12,85 @@ class ExternalPostsGenerator < Jekyll::Generator
# Generator entry point: walks site.config['external_sources'] (when it is set)
# and adds the posts of each configured source to the site.
#
# NOTE(review): this span looks like a rendered diff hunk without +/- markers.
# The inline implementation (from the `p "Fetching ..."` line through the
# `site.collections['posts'].docs << doc` line) appears to be the superseded
# version, interleaved with its replacement (the `puts "Fetching ..."` line and
# the dispatch that follows). Confirm against the real file before editing.
def generate(site)
if site.config['external_sources'] != nil
site.config['external_sources'].each do |src|
p "Fetching external posts from #{src['name']}:"
xml = HTTParty.get(src['rss_url']).body
return if xml.nil?
feed = Feedjira.parse(xml)
feed.entries.each do |e|
p "...fetching #{e.url}"
slug = e.title.downcase.strip.gsub(' ', '-').gsub(/[^\w-]/, '')
path = site.in_source_dir("_posts/#{slug}.md")
doc = Jekyll::Document.new(
path, { :site => site, :collection => site.collections['posts'] }
)
doc.data['external_source'] = src['name'];
doc.data['feed_content'] = e.content;
doc.data['title'] = "#{e.title}";
doc.data['description'] = e.summary;
doc.data['date'] = e.published;
doc.data['redirect'] = e.url;
site.collections['posts'].docs << doc
puts "Fetching external posts from #{src['name']}:"
# A source with `rss_url` is read as a feed; otherwise a source with `posts`
# is read as an explicit list of URLs. A source with neither key is ignored.
if src['rss_url']
fetch_from_rss(site, src)
elsif src['posts']
fetch_from_urls(site, src)
end
end
end
end

# Downloads the feed found at src['rss_url'], parses it with Feedjira and
# passes the parsed entries on to process_entries.
#
# @param site the site object forwarded to process_entries
# @param src [Hash] one external source entry; 'rss_url' is read here
# @return nil when the response has no body, otherwise whatever
#   process_entries returns
def fetch_from_rss(site, src)
  response = HTTParty.get(src['rss_url'])
  feed_xml = response.body

  # Nothing to parse when the response carried no body.
  unless feed_xml.nil?
    parsed_feed = Feedjira.parse(feed_xml)
    process_entries(site, src, parsed_feed.entries)
  end
end

# Logs every feed entry and registers it as an external post through
# create_document.
#
# @param site the site object forwarded to create_document
# @param src [Hash] external source entry; 'name' is used as the source name
# @param entries feed entries responding to url, title, content, summary
#   and published
# @return the entries collection that was passed in
def process_entries(site, src, entries)
  entries.each do |entry|
    puts "...fetching #{entry.url}"

    attributes = {
      title: entry.title,
      content: entry.content,
      summary: entry.summary,
      published: entry.published
    }
    create_document(site, src['name'], entry.url, attributes)
  end
end

# Builds an in-memory Jekyll document for one external article and appends it
# to the site's `posts` collection.
#
# The file name is derived from the article title. When the title is missing,
# blank, or reduces to nothing usable, the source name plus the last segment
# of the URL is used instead, so such posts do not all share one path.
#
# @param site the site being generated; must provide in_source_dir and
#   collections['posts']
# @param source_name [String] stored as the post's `external_source`
# @param url [String] address of the external article, stored as `redirect`
# @param content [Hash] article attributes under :title, :content, :summary
#   and :published
# @return the result of appending the document to the collection's docs
def create_document(site, source_name, url, content)
  slugify = ->(text) { text.to_s.downcase.strip.gsub(' ', '-').gsub(/[^\w-]/, '') }

  slug = slugify.call(content[:title])
  # Ruby's \w is ASCII-only, so an empty or entirely non-ASCII title collapses
  # to '' or to bare dashes; fall back to something unique per article.
  if slug.delete('-').empty?
    slug = "#{slugify.call(source_name)}-#{slugify.call(url.to_s.split('/').last)}"
  end

  path = site.in_source_dir("_posts/#{slug}.md")
  doc = Jekyll::Document.new(
    path, { :site => site, :collection => site.collections['posts'] }
  )
  doc.data['external_source'] = source_name
  doc.data['title'] = content[:title]
  doc.data['feed_content'] = content[:content]
  doc.data['description'] = content[:summary]
  doc.data['date'] = content[:published]
  doc.data['redirect'] = url
  site.collections['posts'].docs << doc
end

# Registers every explicitly listed post of a source as an external post.
#
# For each item in src['posts'] the page at item['url'] is fetched, the
# item's 'published_date' is parsed and stored under :published, and the
# result is handed to create_document.
#
# @param site the site object forwarded to create_document
# @param src [Hash] external source entry; 'posts' and 'name' are read
# @return the src['posts'] collection
def fetch_from_urls(site, src)
  src['posts'].each do |entry|
    link = entry['url']
    puts "...fetching #{link}"

    page = fetch_content_from_url(link)
    # The fetched page carries no date; it comes from the configuration.
    page[:published] = parse_published_date(entry['published_date'])

    create_document(site, src['name'], link, page)
  end
end

# Normalises a configured `published_date` value to a UTC Time.
#
# A YAML value may arrive as a String, a Date (date-only scalar) or a Time
# (scalar with a time component), so all three are accepted.
#
# @param published_date [String, Date, Time] the configured publication date
# @return [Time] the same instant expressed in UTC
# @raise [RuntimeError] when the value is none of the supported types
def parse_published_date(published_date)
  case published_date
  when String
    Time.parse(published_date).utc
  when Time
    # getutc returns a copy; Time#utc would convert the caller's object in place.
    published_date.getutc
  when Date
    published_date.to_time.utc
  else
    raise "Invalid date format for #{published_date}"
  end
end

# Downloads a web page and extracts the pieces used to describe it as a post.
#
# @param url [String] address of the page to download
# @return [Hash] :title (text of `head title`), :content (inner HTML of
#   `body`) and :summary (`content` attribute of the description meta tag);
#   each is '' when the element or attribute is absent. The publication date
#   is not part of this hash.
def fetch_content_from_url(url)
  document = Nokogiri::HTML(HTTParty.get(url).body)

  title_node = document.at('head title')
  description_node = document.at('head meta[name="description"]')
  body_node = document.at('body')

  {
    title: title_node&.text || '',
    content: body_node&.inner_html || '',
    summary: description_node&.attr('content') || ''
  }
end

end
end

0 comments on commit 8511e5d

Please sign in to comment.