From 417ee9e302e3b5edcf94e55bae4b06d4f9080796 Mon Sep 17 00:00:00 2001 From: mlazana Date: Sat, 23 Mar 2019 21:27:41 +0200 Subject: [PATCH 01/13] add env variable URL_BLACKLIST --- archivebox/config.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/archivebox/config.py b/archivebox/config.py index d8e01b24..791c51a7 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -47,6 +47,8 @@ WGET_BINARY = os.getenv('WGET_BINARY', 'wget') YOUTUBEDL_BINARY = os.getenv('YOUTUBEDL_BINARY', 'youtube-dl') CHROME_BINARY = os.getenv('CHROME_BINARY', None) +URL_BLACKLIST = os.getenv('URL_BLACKLIST', '.*youtube.com.*,.*facebook.com/.*,.*.exe') + try: OUTPUT_DIR = os.path.abspath(os.getenv('OUTPUT_DIR')) except Exception: @@ -265,3 +267,10 @@ except KeyboardInterrupt: except: print('[X] There was an error during the startup procedure, your archive data is unaffected.') raise + +URL_BLACKLIST = re.compile( + r'(.*\.youtube\.com)|' + r'(.*\.amazon\.com)|' + r'(.*\.reddit\.com)', + re.IGNORECASE, + ) \ No newline at end of file From 4d1056847750e5ba2aa1cee0800c43ceb68e1bea Mon Sep 17 00:00:00 2001 From: mlazana Date: Sun, 24 Mar 2019 14:40:26 +0200 Subject: [PATCH 02/13] exclude links that are in blacklist --- archivebox/config.py | 3 ++- archivebox/links.py | 18 ++++++++++++++++-- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/archivebox/config.py b/archivebox/config.py index 791c51a7..7235e7ca 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -47,7 +47,7 @@ WGET_BINARY = os.getenv('WGET_BINARY', 'wget') YOUTUBEDL_BINARY = os.getenv('YOUTUBEDL_BINARY', 'youtube-dl') CHROME_BINARY = os.getenv('CHROME_BINARY', None) -URL_BLACKLIST = os.getenv('URL_BLACKLIST', '.*youtube.com.*,.*facebook.com/.*,.*.exe') +URL_BLACKLIST = os.getenv('URL_BLACKLIST', '.*youtube.com.*,.*facebook.com/.*,.*.exe') try: OUTPUT_DIR = os.path.abspath(os.getenv('OUTPUT_DIR')) @@ -270,6 +270,7 @@ except: URL_BLACKLIST = re.compile( r'(.*\.youtube\.com)|' + r'(.*\.facebook\.com)|' r'(.*\.amazon\.com)|' r'(.*\.reddit\.com)', re.IGNORECASE, diff --git a/archivebox/links.py b/archivebox/links.py index ba8057a5..8ca9df94 100644 --- a/archivebox/links.py +++ b/archivebox/links.py @@ -28,13 +28,19 @@ from util import ( check_links_structure, ) +from config import ( + URL_BLACKLIST, +) def validate_links(links): check_links_structure(links) links = archivable_links(links) # remove chrome://, about:, mailto: etc. links = uniquefied_links(links) # merge/dedupe duplicate timestamps & urls links = sorted_links(links) # deterministically sort the links based on timstamp, url - + links = exclude_links(links) # exclude links that are in blacklist + + print(links) + if not links: print('[X] No links found :(') raise SystemExit(1) @@ -42,7 +48,8 @@ def validate_links(links): for link in links: link['title'] = unescape(link['title'].strip()) if link['title'] else None check_link_structure(link) - + + print("FINAL LIST", list(links)) return list(links) @@ -115,3 +122,10 @@ def lowest_uniq_timestamp(used_timestamps, timestamp): new_timestamp = '{}.{}'.format(timestamp, nonce) return new_timestamp + +def exclude_links(links): + """ exclude links that are in blacklist""" + + links = [link for link in links if not URL_BLACKLIST.match(link['url'])] + + return links \ No newline at end of file From 81d846427e95a80cc92bac0b28f04c2e8d06ccf3 Mon Sep 17 00:00:00 2001 From: mlazana Date: Sun, 24 Mar 2019 19:04:22 +0200 Subject: [PATCH 03/13] fix comments in links.py --- archivebox/links.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/archivebox/links.py b/archivebox/links.py index 8ca9df94..fd8985ec 100644 --- a/archivebox/links.py +++ b/archivebox/links.py @@ -38,9 +38,7 @@ def validate_links(links): links = uniquefied_links(links) # merge/dedupe duplicate timestamps & urls links = sorted_links(links) # deterministically sort the links based on timstamp, url links = exclude_links(links) # exclude links that are in blacklist - - print(links) - + if not links: print('[X] No links found :(') raise SystemExit(1) @@ -49,7 +47,6 @@ def validate_links(links): link['title'] = unescape(link['title'].strip()) if link['title'] else None check_link_structure(link) - print("FINAL LIST", list(links)) return list(links) @@ -124,7 +121,7 @@ def lowest_uniq_timestamp(used_timestamps, timestamp): return new_timestamp def exclude_links(links): - """ exclude links that are in blacklist""" + """exclude links that are in blacklist""" links = [link for link in links if not URL_BLACKLIST.match(link['url'])] From 8502fa5cc3aa608a546fd93483f113a826b02332 Mon Sep 17 00:00:00 2001 From: mlazana Date: Wed, 27 Mar 2019 20:10:05 +0200 Subject: [PATCH 04/13] config.py: update function exclude_blacklisted(links) --- archivebox/links.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/archivebox/links.py b/archivebox/links.py index fd8985ec..5eff61f4 100644 --- a/archivebox/links.py +++ b/archivebox/links.py @@ -34,11 +34,11 @@ from config import ( def validate_links(links): check_links_structure(links) - links = archivable_links(links) # remove chrome://, about:, mailto: etc. - links = uniquefied_links(links) # merge/dedupe duplicate timestamps & urls - links = sorted_links(links) # deterministically sort the links based on timstamp, url - links = exclude_links(links) # exclude links that are in blacklist - + links = archivable_links(links) # remove chrome://, about:, mailto: etc. + links = uniquefied_links(links) # merge/dedupe duplicate timestamps & urls + links = sorted_links(links) # deterministically sort the links based on timstamp, url + links = list(exclude_links(links)) # exclude URLs that match the blacklisted url pattern regex + if not links: print('[X] No links found :(') raise SystemExit(1) @@ -46,7 +46,7 @@ def validate_links(links): for link in links: link['title'] = unescape(link['title'].strip()) if link['title'] else None check_link_structure(link) - + return list(links) @@ -120,9 +120,8 @@ def lowest_uniq_timestamp(used_timestamps, timestamp): return new_timestamp -def exclude_links(links): - """exclude links that are in blacklist""" - - links = [link for link in links if not URL_BLACKLIST.match(link['url'])] - - return links \ No newline at end of file +def exclude_blacklisted(links): + """exclude URLs that match the blacklisted url pattern regex""" + return (link for link in links if not URL_BLACKLIST.match(link['url'])) + + \ No newline at end of file From 380ba5e4567b1ffd2f18f71d3d436f3a50ba9496 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 28 Mar 2019 17:06:33 -0400 Subject: [PATCH 05/13] Create documentation_change.md --- .github/ISSUE_TEMPLATE/documentation_change.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/documentation_change.md diff --git a/.github/ISSUE_TEMPLATE/documentation_change.md b/.github/ISSUE_TEMPLATE/documentation_change.md new file mode 100644 index 00000000..91afc2ca --- /dev/null +++ b/.github/ISSUE_TEMPLATE/documentation_change.md @@ -0,0 +1,15 @@ +--- +name: Documentation Change +about: Submit a suggestion for the Wiki documentation +title: '' +labels: '' +assignees: '' + +--- + +## Wiki Page URL + + +## Suggested Edit + +... From 157941fc284929864290f317fd8c30f74bba8083 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 28 Mar 2019 17:11:43 -0400 Subject: [PATCH 06/13] Update bug_report.md --- .github/ISSUE_TEMPLATE/bug_report.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index f2fea3ba..b350fb28 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -1,5 +1,5 @@ --- -name: Bug report +name: 🐞 Bug report about: Create a report to help us improve title: '' labels: '' From 39f04ce5f6db254c1111da0bcf38d243ccc78152 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 28 Mar 2019 17:11:58 -0400 Subject: [PATCH 07/13] Update documentation_change.md --- .github/ISSUE_TEMPLATE/documentation_change.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ISSUE_TEMPLATE/documentation_change.md b/.github/ISSUE_TEMPLATE/documentation_change.md index 91afc2ca..05c01345 100644 --- a/.github/ISSUE_TEMPLATE/documentation_change.md +++ b/.github/ISSUE_TEMPLATE/documentation_change.md @@ -1,5 +1,5 @@ --- -name: Documentation Change +name: 📑 Documentation Change about: Submit a suggestion for the Wiki documentation title: '' labels: '' From 0e1e86039365b200bf8ec143acbfa962a0a7a5d5 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 28 Mar 2019 17:12:25 -0400 Subject: [PATCH 08/13] Update feature_request.md --- .github/ISSUE_TEMPLATE/feature_request.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md index 4489f7dc..0f9423f5 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -1,5 +1,5 @@ --- -name: Feature request +name: 💡 Feature request about: Suggest an idea for this project title: '' labels: '' From 28334118539864b79ccdf916933f2bf99fe4a6a2 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 28 Mar 2019 17:13:01 -0400 Subject: [PATCH 09/13] Update documentation_change.md --- .github/ISSUE_TEMPLATE/documentation_change.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ISSUE_TEMPLATE/documentation_change.md b/.github/ISSUE_TEMPLATE/documentation_change.md index 05c01345..dc3c2741 100644 --- a/.github/ISSUE_TEMPLATE/documentation_change.md +++ b/.github/ISSUE_TEMPLATE/documentation_change.md @@ -1,5 +1,5 @@ --- -name: 📑 Documentation Change +name: 📑 Documentation change about: Submit a suggestion for the Wiki documentation title: '' labels: '' From 066b36b6a9d75d9dc15060b1329a3a617250d576 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sat, 30 Mar 2019 14:56:19 -0400 Subject: [PATCH 10/13] make URL_BLACKLIST empty by default --- archivebox/config.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/archivebox/config.py b/archivebox/config.py index ec970a22..0d49a5d2 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -47,7 +47,7 @@ WGET_BINARY = os.getenv('WGET_BINARY', 'wget') YOUTUBEDL_BINARY = os.getenv('YOUTUBEDL_BINARY', 'youtube-dl') CHROME_BINARY = os.getenv('CHROME_BINARY', None) -URL_BLACKLIST = os.getenv('URL_BLACKLIST', '.*youtube.com.*,.*facebook.com/.*,.*.exe') +URL_BLACKLIST = os.getenv('URL_BLACKLIST', None) try: OUTPUT_DIR = os.path.abspath(os.getenv('OUTPUT_DIR')) @@ -76,6 +76,8 @@ USE_CHROME = FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM USE_WGET = FETCH_WGET or FETCH_WGET_REQUISITES or FETCH_WARC WGET_AUTO_COMPRESSION = USE_WGET and WGET_BINARY and (not run([WGET_BINARY, "--compression=auto", "--help"], stdout=DEVNULL).returncode) +URL_BLACKLIST = URL_BLACKLIST and re.compile(URL_BLACKLIST, re.IGNORECASE) + ########################### Environment & Dependencies ######################### try: @@ -268,11 +270,3 @@ except KeyboardInterrupt: except: print('[X] There was an error during the startup procedure, your archive data is unaffected.') raise - -URL_BLACKLIST = re.compile( - r'(.*\.youtube\.com)|' - r'(.*\.facebook\.com)|' - r'(.*\.amazon\.com)|' - r'(.*\.reddit\.com)', - re.IGNORECASE, - ) \ No newline at end of file From 529a0f8bb2655128b03b568d2fe41f506645fb9d Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sat, 30 Mar 2019 15:00:21 -0400 Subject: [PATCH 11/13] fix broken function name --- archivebox/links.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/archivebox/links.py b/archivebox/links.py index 5eff61f4..0ec33fe5 100644 --- a/archivebox/links.py +++ b/archivebox/links.py @@ -37,7 +37,6 @@ def validate_links(links): links = archivable_links(links) # remove chrome://, about:, mailto: etc. links = uniquefied_links(links) # merge/dedupe duplicate timestamps & urls links = sorted_links(links) # deterministically sort the links based on timstamp, url - links = list(exclude_links(links)) # exclude URLs that match the blacklisted url pattern regex if not links: print('[X] No links found :(') @@ -52,11 +51,11 @@ def validate_links(links): def archivable_links(links): """remove chrome://, about:// or other schemed links that cant be archived""" - return ( - link - for link in links - if any(link['url'].lower().startswith(s) for s in ('http://', 'https://', 'ftp://')) - ) + for link in links: + scheme_is_valid = scheme(url) in ('http', 'https', 'ftp) + not_blacklisted = (not URL_BLACKLIST.match(link['url'])) if URL_BLACKLIST else True + if scheme_is_valid and not_blacklisted: + yield link def uniquefied_links(sorted_links): @@ -119,9 +118,5 @@ def lowest_uniq_timestamp(used_timestamps, timestamp): new_timestamp = '{}.{}'.format(timestamp, nonce) return new_timestamp - -def exclude_blacklisted(links): - """exclude URLs that match the blacklisted url pattern regex""" - return (link for link in links if not URL_BLACKLIST.match(link['url'])) - \ No newline at end of file + From 1191cf1df1bb9ae8ddc9ef93eb45c9a78fec47bf Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sat, 30 Mar 2019 15:04:43 -0400 Subject: [PATCH 12/13] fix importerror of scheme --- archivebox/links.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/archivebox/links.py b/archivebox/links.py index 0ec33fe5..c7747994 100644 --- a/archivebox/links.py +++ b/archivebox/links.py @@ -23,6 +23,7 @@ from html import unescape from collections import OrderedDict from util import ( + scheme, merge_links, check_link_structure, check_links_structure, @@ -52,7 +53,7 @@ def validate_links(links): def archivable_links(links): """remove chrome://, about:// or other schemed links that cant be archived""" for link in links: - scheme_is_valid = scheme(url) in ('http', 'https', 'ftp) + scheme_is_valid = scheme(link['url']) in ('http', 'https', 'ftp') not_blacklisted = (not URL_BLACKLIST.match(link['url'])) if URL_BLACKLIST else True if scheme_is_valid and not_blacklisted: yield link From 19a7f12a00567bcd06543f908df9ad0d28d68163 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sat, 30 Mar 2019 15:30:21 -0400 Subject: [PATCH 13/13] Update PULL_REQUEST_TEMPLATE.md --- .github/PULL_REQUEST_TEMPLATE.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 44e56a16..c903d1a9 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,3 +1,5 @@ +**IMPORTANT: Do not submit PRs with only formatting / PEP8 / line length changes, I will close them with great prejudice. The PEP8 checks I don't follow are intentional. PRs for minor bugfixes, typos, etc are fine.** + # Summary e.g. This PR fixes ABC or adds the ability to do XYZ...