From 75919d7b7d028fe279a1ed2d2b2af53e65647f4b Mon Sep 17 00:00:00 2001 From: Nahuel Sotelo Date: Sun, 26 Apr 2020 12:18:45 -0300 Subject: [PATCH 1/3] No need for dotenv preload --- diffengine/__init__.py | 44 +++++++++++++++++++++++------------------- requirements.txt | 1 + test_diffengine.py | 33 +++++++++++++++++++++++++++++-- 3 files changed, 56 insertions(+), 22 deletions(-) diff --git a/diffengine/__init__.py b/diffengine/__init__.py index d728efb..123a63c 100755 --- a/diffengine/__init__.py +++ b/diffengine/__init__.py @@ -24,6 +24,7 @@ import subprocess import readability import unicodedata +import yaml from peewee import * from playhouse.migrate import SqliteMigrator, migrate @@ -31,6 +32,7 @@ from selenium import webdriver from urllib.parse import urlparse, urlunparse, parse_qs, urlencode from selenium.webdriver.firefox.options import Options as FirefoxOptions +from envyaml import EnvYAML home = None config = {} @@ -47,7 +49,7 @@ class Feed(BaseModel): url = CharField(primary_key=True) name = CharField() created = DateTimeField(default=datetime.utcnow) - + @property def entries(self): return (Entry.select() @@ -70,7 +72,7 @@ def get_latest(self): return 0 count = 0 for e in feed.entries: - # note: look up with url only, because there may be + # note: look up with url only, because there may be # overlap bewteen feeds, especially when a large newspaper # has multiple feeds entry, created = Entry.get_or_create(url=e.link) @@ -78,7 +80,7 @@ def get_latest(self): FeedEntry.create(entry=entry, feed=self) logging.info("found new entry: %s", e.link) count += 1 - elif len(entry.feeds.where(Feed.url == self.url)) == 0: + elif len(entry.feeds.where(Feed.url == self.url)) == 0: FeedEntry.create(entry=entry, feed=self) logging.debug("found entry from another feed: %s", e.link) count += 1 @@ -98,10 +100,10 @@ def feeds(self): .join(Entry) .where(Entry.id==self.id)) - @property + @property def stale(self): """ - A heuristic for checking new content very often, and 
checking + A heuristic for checking new content very often, and checking older content less frequently. If an entry is deemed stale then it is worth checking again to see if the content has changed. """ @@ -131,10 +133,10 @@ def stale(self): def get_latest(self): """ - get_latest is the heart of the application. It will get the current - version on the web, extract its summary with readability and compare - it against a previous version. If a difference is found it will - compute the diff, save it as html and png files, and tell Internet + get_latest is the heart of the application. It will get the current + version on the web, extract its summary with readability and compare + it against a previous version. If a difference is found it will + compute the diff, save it as html and png files, and tell Internet Archive to create a snapshot. If a new version was found it will be returned, otherwise None will @@ -172,7 +174,7 @@ def get_latest(self): else: old = versions[0] - # compare what we got against the latest version and create a + # compare what we got against the latest version and create a # new version if it looks different, or is brand new (no old version) new = None @@ -255,7 +257,7 @@ def archive(self): self.save() return self.archive_url else: - logging.error("unable to get archive url from %s [%s]: %s", + logging.error("unable to get archive url from %s [%s]: %s", save_url, resp.status_code, resp.headers) except Exception as e: @@ -358,7 +360,7 @@ def load_config(prompt=True): global config config_file = os.path.join(home, "config.yaml") if os.path.isfile(config_file): - config = yaml.load(open(config_file), Loader=yaml.FullLoader) + config = EnvYAML(config_file) else: if not os.path.isdir(home): os.makedirs(home) @@ -472,6 +474,8 @@ def tweet_diff(diff, token): def init(new_home, prompt=True): global home home = new_home + env_path = "%s/.env" % new_home + load_dotenv(dotenv_path=env_path) load_config(prompt) setup_browser() setup_logging() @@ -486,7 
+490,7 @@ def main(): init(home) start_time = datetime.utcnow() logging.info("starting up with home=%s", home) - + checked = skipped = new = 0 for f in config.get('feeds', []): @@ -496,7 +500,7 @@ def main(): # get latest feed entries feed.get_latest() - + # get latest content for each entry for entry in feed.entries: if not entry.stale: @@ -514,7 +518,7 @@ def main(): tweet_diff(version.diff, f['twitter']) elapsed = datetime.utcnow() - start_time - logging.info("shutting down: new=%s checked=%s skipped=%s elapsed=%s", + logging.info("shutting down: new=%s checked=%s skipped=%s elapsed=%s", new, checked, skipped, elapsed) browser.quit() @@ -530,7 +534,7 @@ def _normal(s): s = s.replace('”', '"') s = s.replace("’", "'") s = s.replace("\n", " ") - s = s.replace("­", "") + s = s.replace("­", "") s = re.sub(r' +', ' ', s) s = s.strip() return s @@ -538,12 +542,12 @@ def _normal(s): def _equal(s1, s2): return _fingerprint(s1) == _fingerprint(s2) -punctuation = dict.fromkeys(i for i in range(sys.maxunicode) +punctuation = dict.fromkeys(i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith('P')) def _fingerprint(s): - # make sure the string has been normalized, bleach everything, remove all - # whitespace and punctuation to create a pseudo fingerprint for the text + # make sure the string has been normalized, bleach everything, remove all + # whitespace and punctuation to create a pseudo fingerprint for the text # for use during comparison s = _normal(s) s = bleach.clean(s, tags=[], strip=True) @@ -566,7 +570,7 @@ def _remove_utm(url): def _get(url, allow_redirects=True): return requests.get( - url, + url, timeout=60, headers={"User-Agent": UA}, allow_redirects=allow_redirects diff --git a/requirements.txt b/requirements.txt index c71d0d0..cd787cb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,3 +11,4 @@ selenium feedparser readability-lxml https://github.com/edsu/htmldiff/tarball/master#egg=htmldiff-0.2 +envyaml>=0.1912 diff --git 
a/test_diffengine.py b/test_diffengine.py index b2713a9..c584ce0 100644 --- a/test_diffengine.py +++ b/test_diffengine.py @@ -10,7 +10,8 @@ shutil.rmtree("test") # set things up but disable prompting for initial feed -init("test", prompt=False) +test_home = "test" +init(test_home, prompt=False) # the sequence of these tests is significant @@ -74,7 +75,7 @@ def test_html_diff(): def test_many_to_many(): # these two feeds share this entry, we want diffengine to support - # multiple feeds for the same content, which is fairly common at + # multiple feeds for the same content, which is fairly common at # large media organizations with multiple topical feeds url="https://www.washingtonpost.com/classic-apps/how-a-week-of-tweets-by-trump-stoked-anxiety-moved-markets-and-altered-plans/2017/01/07/38be8e64-d436-11e6-9cb0-54ab630851e8_story.html" @@ -116,3 +117,31 @@ def test_fingerprint(): assert _fingerprint("foo
bar") == "foobar" assert _fingerprint("foo'bar") == "foobar" assert _fingerprint("foo’bar") == "foobar" + +def test_environment_vars_in_config_file(): + # Test values + public_value = "public value" + private_yaml_key = "${PRIVATE_VAR}" + private_value = "private value" + + # Create dot env that that will read + dotenv_file = open(".env","w+") + dotenv_file.write("PRIVATE_VAR=%s\n" % private_value) + + # Create config.yaml that will be read + test_config = { + "example": { + "private_value": private_yaml_key, + "public_value": public_value + } + } + config_file = home_path(test_home, "config.yaml"); + yaml.dump(test_config, open(config_file, "w"), default_flow_style=False) + + # Test! + init("test") + config = get_initial_config() + assert config['example']['public_value'] == public_value + assert config['example']['private_value'] != private_yaml_key + assert config['example']['private_value'] == private_value + From c4e98d3a9ed8849cb8d781a61acee15735604d6b Mon Sep 17 00:00:00 2001 From: Nahuel Sotelo Date: Sun, 26 Apr 2020 12:20:13 -0300 Subject: [PATCH 2/3] No need for dotenv preload --- diffengine/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/diffengine/__init__.py b/diffengine/__init__.py index 123a63c..7dadbee 100755 --- a/diffengine/__init__.py +++ b/diffengine/__init__.py @@ -474,8 +474,6 @@ def tweet_diff(diff, token): def init(new_home, prompt=True): global home home = new_home - env_path = "%s/.env" % new_home - load_dotenv(dotenv_path=env_path) load_config(prompt) setup_browser() setup_logging() From 6fa8df2b4e3af09e5e1af769cb69bb3848f3f3ce Mon Sep 17 00:00:00 2001 From: Nahuel Sotelo Date: Sun, 26 Apr 2020 12:39:53 -0300 Subject: [PATCH 3/3] Docs --- README.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/README.md b/README.md index 8eb5f48..f506e29 100644 --- a/README.md +++ b/README.md @@ -132,6 +132,27 @@ twitter: consumer_secret: CONSUMER_SECRET ``` +### Support for environment vars + +The configuration 
file has support for [environment variables](https://medium.com/chingu/an-introduction-to-environment-variables-and-how-to-use-them-f602f66d15fa). This is useful if you want to keep your credentials secure when deploying to Heroku, Vercel (formerly ZEIT Now), AWS, Azure, Google Cloud or any other similar service. The environment variables are defined in the app settings of the platform you use or directly in a [dotenv file](https://12factor.net/config), which is the usual case when coding locally. + +For instance, say you want to keep your Twitter credentials safe. You'd keep a reference to them in the `config.yaml` this way: + +```yml +twitter: + consumer_key: "${MY_CONSUMER_KEY_ENV_VAR}" + consumer_secret: "${MY_CONSUMER_SECRET_ENV_VAR}" +``` + +Then you would define your environment variables `MY_CONSUMER_KEY_ENV_VAR` and `MY_CONSUMER_SECRET_ENV_VAR` in your `.env` file: + +```dotenv +MY_CONSUMER_KEY_ENV_VAR="CONSUMER_KEY" +MY_CONSUMER_SECRET_ENV_VAR="CONSUMER_SECRET" +``` + +Done! You can use diffengine as usual and keep your credentials safe. + ## Develop [![Build Status](https://travis-ci.org/DocNow/diffengine.svg)](http://travis-ci.org/DocNow/diffengine)