Skip to content

Commit

Permalink
Merge pull request #67 from nahuelhds/feature/envyaml
Browse files Browse the repository at this point in the history
Environment vars in `config.yaml` file
  • Loading branch information
edsu authored Apr 26, 2020
2 parents 756569d + 6fa8df2 commit 564b26e
Show file tree
Hide file tree
Showing 4 changed files with 75 additions and 22 deletions.
21 changes: 21 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,27 @@ twitter:
consumer_secret: CONSUMER_SECRET
```
### Support for environment vars
The configuration file has support for [environment variables](https://medium.com/chingu/an-introduction-to-environment-variables-and-how-to-use-them-f602f66d15fa). This is useful if you want to keep your credentials secure when deploying to Heroku, Vercel (formerly ZEIT Now), AWS, Azure, Google Cloud or any other similar service. The environment variables are defined in the app settings of the platform you use or directly in a [dotenv file](https://12factor.net/config), which is the usual case when coding locally.
For instance, say you want to keep your Twitter credentials safe. You'd keep a reference to them in the `config.yaml` this way:

```yml
twitter:
consumer_key: "${MY_CONSUMER_KEY_ENV_VAR}"
consumer_secret: "${MY_CONSUMER_SECRET_ENV_VAR}"
```

Then you would define your environment variables `MY_CONSUMER_KEY_ENV_VAR` and `MY_CONSUMER_SECRET_ENV_VAR` in your `.env` file:

```dotenv
MY_CONSUMER_KEY_ENV_VAR="CONSUMER_KEY"
MY_CONSUMER_SECRET_ENV_VAR="CONSUMER_SECRET"
```

Done! You can use diffengine as usual and keep your credentials safe.

## Develop

[![Build Status](https://travis-ci.org/DocNow/diffengine.svg)](http://travis-ci.org/DocNow/diffengine)
Expand Down
42 changes: 22 additions & 20 deletions diffengine/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,15 @@
import subprocess
import readability
import unicodedata
import yaml

from peewee import *
from playhouse.migrate import SqliteMigrator, migrate
from datetime import datetime, timedelta
from selenium import webdriver
from urllib.parse import urlparse, urlunparse, parse_qs, urlencode
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from envyaml import EnvYAML

home = None
config = {}
Expand All @@ -47,7 +49,7 @@ class Feed(BaseModel):
url = CharField(primary_key=True)
name = CharField()
created = DateTimeField(default=datetime.utcnow)

@property
def entries(self):
return (Entry.select()
Expand All @@ -70,15 +72,15 @@ def get_latest(self):
return 0
count = 0
for e in feed.entries:
# note: look up with url only, because there may be
# note: look up with url only, because there may be
# overlap between feeds, especially when a large newspaper
# has multiple feeds
entry, created = Entry.get_or_create(url=e.link)
if created:
FeedEntry.create(entry=entry, feed=self)
logging.info("found new entry: %s", e.link)
count += 1
elif len(entry.feeds.where(Feed.url == self.url)) == 0:
elif len(entry.feeds.where(Feed.url == self.url)) == 0:
FeedEntry.create(entry=entry, feed=self)
logging.debug("found entry from another feed: %s", e.link)
count += 1
Expand All @@ -98,10 +100,10 @@ def feeds(self):
.join(Entry)
.where(Entry.id==self.id))

@property
@property
def stale(self):
"""
A heuristic for checking new content very often, and checking
A heuristic for checking new content very often, and checking
older content less frequently. If an entry is deemed stale then
it is worth checking again to see if the content has changed.
"""
Expand Down Expand Up @@ -131,10 +133,10 @@ def stale(self):

def get_latest(self):
"""
get_latest is the heart of the application. It will get the current
version on the web, extract its summary with readability and compare
it against a previous version. If a difference is found it will
compute the diff, save it as html and png files, and tell Internet
get_latest is the heart of the application. It will get the current
version on the web, extract its summary with readability and compare
it against a previous version. If a difference is found it will
compute the diff, save it as html and png files, and tell Internet
Archive to create a snapshot.
If a new version was found it will be returned, otherwise None will
Expand Down Expand Up @@ -172,7 +174,7 @@ def get_latest(self):
else:
old = versions[0]

# compare what we got against the latest version and create a
# compare what we got against the latest version and create a
# new version if it looks different, or is brand new (no old version)
new = None

Expand Down Expand Up @@ -255,7 +257,7 @@ def archive(self):
self.save()
return self.archive_url
else:
logging.error("unable to get archive url from %s [%s]: %s",
logging.error("unable to get archive url from %s [%s]: %s",
save_url, resp.status_code, resp.headers)

except Exception as e:
Expand Down Expand Up @@ -358,7 +360,7 @@ def load_config(prompt=True):
global config
config_file = os.path.join(home, "config.yaml")
if os.path.isfile(config_file):
config = yaml.load(open(config_file), Loader=yaml.FullLoader)
config = EnvYAML(config_file)
else:
if not os.path.isdir(home):
os.makedirs(home)
Expand Down Expand Up @@ -486,7 +488,7 @@ def main():
init(home)
start_time = datetime.utcnow()
logging.info("starting up with home=%s", home)

checked = skipped = new = 0

for f in config.get('feeds', []):
Expand All @@ -496,7 +498,7 @@ def main():

# get latest feed entries
feed.get_latest()

# get latest content for each entry
for entry in feed.entries:
if not entry.stale:
Expand All @@ -514,7 +516,7 @@ def main():
tweet_diff(version.diff, f['twitter'])

elapsed = datetime.utcnow() - start_time
logging.info("shutting down: new=%s checked=%s skipped=%s elapsed=%s",
logging.info("shutting down: new=%s checked=%s skipped=%s elapsed=%s",
new, checked, skipped, elapsed)

browser.quit()
Expand All @@ -530,20 +532,20 @@ def _normal(s):
s = s.replace('”', '"')
s = s.replace("’", "'")
s = s.replace("\n", " ")
s = s.replace("­", "")
s = s.replace("­", "")
s = re.sub(r' +', ' ', s)
s = s.strip()
return s

def _equal(s1, s2):
return _fingerprint(s1) == _fingerprint(s2)

punctuation = dict.fromkeys(i for i in range(sys.maxunicode)
punctuation = dict.fromkeys(i for i in range(sys.maxunicode)
if unicodedata.category(chr(i)).startswith('P'))

def _fingerprint(s):
# make sure the string has been normalized, bleach everything, remove all
# whitespace and punctuation to create a pseudo fingerprint for the text
# make sure the string has been normalized, bleach everything, remove all
# whitespace and punctuation to create a pseudo fingerprint for the text
# for use during comparison
s = _normal(s)
s = bleach.clean(s, tags=[], strip=True)
Expand All @@ -566,7 +568,7 @@ def _remove_utm(url):

def _get(url, allow_redirects=True):
return requests.get(
url,
url,
timeout=60,
headers={"User-Agent": UA},
allow_redirects=allow_redirects
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,4 @@ selenium
feedparser
readability-lxml
https://github.com/edsu/htmldiff/tarball/master#egg=htmldiff-0.2
envyaml>=0.1912
33 changes: 31 additions & 2 deletions test_diffengine.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
shutil.rmtree("test")

# set things up but disable prompting for initial feed
init("test", prompt=False)
test_home = "test"
init(test_home, prompt=False)

# the sequence of these tests is significant

Expand Down Expand Up @@ -74,7 +75,7 @@ def test_html_diff():
def test_many_to_many():

# these two feeds share this entry, we want diffengine to support
# multiple feeds for the same content, which is fairly common at
# multiple feeds for the same content, which is fairly common at
# large media organizations with multiple topical feeds
url="https://www.washingtonpost.com/classic-apps/how-a-week-of-tweets-by-trump-stoked-anxiety-moved-markets-and-altered-plans/2017/01/07/38be8e64-d436-11e6-9cb0-54ab630851e8_story.html"

Expand Down Expand Up @@ -116,3 +117,31 @@ def test_fingerprint():
assert _fingerprint("foo<br>bar") == "foobar"
assert _fingerprint("foo'bar") == "foobar"
assert _fingerprint("foo’bar") == "foobar"

def test_environment_vars_in_config_file():
    """Check that `${VAR}` placeholders in config.yaml are resolved.

    The test writes a `.env` file in the current working directory that
    defines PRIVATE_VAR, writes a config.yaml in the test home directory
    that refers to it as `${PRIVATE_VAR}`, re-initializes diffengine and
    then asserts that the loaded configuration holds the value from the
    `.env` file rather than the literal placeholder.
    """
    # Test values
    public_value = "public value"
    private_yaml_key = "${PRIVATE_VAR}"
    private_value = "private value"

    # Create the dotenv file that will be read. The `with` block closes
    # the file, so the contents are flushed to disk before init() runs.
    with open(".env", "w+") as dotenv_file:
        dotenv_file.write("PRIVATE_VAR=%s\n" % private_value)

    # Create the config.yaml that will be read
    test_config = {
        "example": {
            "private_value": private_yaml_key,
            "public_value": public_value
        }
    }
    config_file = home_path(test_home, "config.yaml")
    with open(config_file, "w") as config_fh:
        yaml.dump(test_config, config_fh, default_flow_style=False)

    # Test!
    init(test_home)
    config = get_initial_config()
    assert config['example']['public_value'] == public_value
    # the placeholder must have been substituted, not passed through
    assert config['example']['private_value'] != private_yaml_key
    assert config['example']['private_value'] == private_value

0 comments on commit 564b26e

Please sign in to comment.