From c0cb343db425352a1e01c1d0bc097c41d6630d23 Mon Sep 17 00:00:00 2001
From: Rick <37203861+rickhehe@users.noreply.github.com>
Date: Mon, 7 Jan 2019 23:15:23 +1300
Subject: [PATCH] Pcc42 (#429)

* PCC42 rickhehe

* PCC42 passed my own test
---
 42/rickhehe/regex.py      | 49 +++++++++++++++++++++++++++++++++++++++
 42/rickhehe/test_regex.py | 40 ++++++++++++++++++++++++++++++++
 2 files changed, 89 insertions(+)
 create mode 100644 42/rickhehe/regex.py
 create mode 100644 42/rickhehe/test_regex.py

diff --git a/42/rickhehe/regex.py b/42/rickhehe/regex.py
new file mode 100644
index 000000000..8917b4baa
--- /dev/null
+++ b/42/rickhehe/regex.py
@@ -0,0 +1,49 @@
+import re
+
+
+def extract_course_times():
+    '''Use re.findall to capture all mm:ss timestamps in a list'''
+    flask_course = ('Introduction 1 Lecture 01:47'
+                    'The Basics 4 Lectures 32:03'
+                    'Getting Technical! 4 Lectures 41:51'
+                    'Challenge 2 Lectures 27:48'
+                    'Afterword 1 Lecture 05:02')
+    return re.findall(r'\d{2}:\d{2}', flask_course)
+
+#print(extract_course_times())
+
+def split_on_multiple_chars():
+    '''Use re.split to split log line by ; , .
+       but not on the last ... so list should have len of 4
+       (hint check re.split docs for extra switches)'''
+    logline = ('2017-11-03T01:00:02;challenge time,regex!.'
+               'hope you join ... soon')
+    return re.split(r'[;,.]', logline, maxsplit = 3)
+#print(len(split_on_multiple_chars()))
+
+def get_all_hashtags_and_links():
+    '''Use re.findall to extract the URL and 2 hashtags of this tweet'''
+    tweet = ('New PyBites article: Module of the Week - Requests-cache '
+             'for Repeated API Calls - http://pybit.es/requests-cache.html '
+             '#python #APIs')
+    return re.findall(r'http\S+|#\S+', tweet) #alternatively use (?:http|#)\S+, complete string will be returned as no captured groups.
+#print(get_all_hashtags_and_links())
+
+def match_first_paragraph():
+    '''Use re.sub to extract the content of the first paragraph (excl tags)'''
+    html = ('<p>pybites != greedy</p>'
+            '<p>not the same can be said REgarding ...</p>')
+    return re.sub(r'.*?<p>(.*?)<.+', r'\1', html)
+#print(match_first_paragraph())
+
+
+def find_double_words():
+    '''Use re.search(regex, text).group() to find the double word'''
+    text = 'Spain is so nice in the the spring'
+    return re.search(r'\b(\w+)\s+\1\b', text).group()
+#print(find_double_words())
+
+def match_ip_v4_address(ip):
+    '''Use re.match to match an ip v4 address (no need for exact IP ranges)'''
+    return re.match(r'\d{,3}\.\d{,3}\.\d{,3}\.\d{,3}', ip)
+#print(match_ip_v4_address('192.168.0.1'))
diff --git a/42/rickhehe/test_regex.py b/42/rickhehe/test_regex.py
new file mode 100644
index 000000000..35403974f
--- /dev/null
+++ b/42/rickhehe/test_regex.py
@@ -0,0 +1,40 @@
+from regex import (extract_course_times, split_on_multiple_chars,
+                   get_all_hashtags_and_links, match_first_paragraph,
+                   find_double_words, match_ip_v4_address)
+
+
+def test_extract_course_times():
+    expected = ['01:47', '32:03', '41:51', '27:48', '05:02']
+    assert extract_course_times() == expected
+
+
+def test_split_on_multiple_chars():
+    expected = ['2017-11-03T01:00:02', 'challenge time',
+                'regex!', 'hope you join ... soon']
+    assert split_on_multiple_chars() == expected
+
+
+def test_get_all_hashtags_and_links():
+    expected = ['http://pybit.es/requests-cache.html', '#python', '#APIs']
+    assert get_all_hashtags_and_links() == expected
+
+
+def test_match_first_paragraph():
+    expected = 'pybites != greedy'
+    assert match_first_paragraph() == expected
+
+
+def test_find_double_words():
+    expected = 'the the'
+    assert find_double_words() == expected
+
+
+def test_match_ip_address():
+    valid_ips = ['1.1.1.1', '255.255.255.255', '192.168.1.1',
+                 '10.10.1.1', '132.254.111.10', '26.10.2.10',
+                 '127.0.0.1']
+    bad_ips = ['10.10.10', '10.10', '10', 'a.a.a.a', '10.0.0.a']
+    for valid_ip in valid_ips:
+        assert match_ip_v4_address(valid_ip)
+    for bad_ip in bad_ips:
+        assert match_ip_v4_address(bad_ip) is None