From 46e80dd50933e563712e9ce90fc536f02b3c983c Mon Sep 17 00:00:00 2001
From: Ross Williams
Date: Sun, 30 Jul 2023 23:43:04 -0400
Subject: [PATCH] Rename URL_(WHITE|BLACK)LIST to URL_(ALLOW|DENY)LIST

Retain aliases for old configuration files
---
 archivebox/config.py         | 8 ++++----
 archivebox/config_stubs.py   | 2 +-
 archivebox/core/forms.py     | 2 +-
 archivebox/index/__init__.py | 8 ++++----
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/archivebox/config.py b/archivebox/config.py
index 739d7f12..f5eef758 100644
--- a/archivebox/config.py
+++ b/archivebox/config.py
@@ -82,8 +82,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         'MEDIA_TIMEOUT': {'type': int, 'default': 3600},
         'OUTPUT_PERMISSIONS': {'type': str, 'default': '644'},
         'RESTRICT_FILE_NAMES': {'type': str, 'default': 'windows'},
-        'URL_BLACKLIST': {'type': str, 'default': r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$'},  # to avoid downloading code assets as their own pages
-        'URL_WHITELIST': {'type': str, 'default': None},
+        'URL_DENYLIST': {'type': str, 'default': r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$', 'aliases': ('URL_BLACKLIST',)},  # to avoid downloading code assets as their own pages
+        'URL_ALLOWLIST': {'type': str, 'default': None, 'aliases': ('URL_WHITELIST',)},
         'ENFORCE_ATOMIC_WRITES': {'type': bool, 'default': True},
         'TAG_SEPARATOR_PATTERN': {'type': str, 'default': r'[,]'},
     },
@@ -371,8 +371,8 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'CONFIG_FILE': {'default': lambda c: Path(c['CONFIG_FILE']).resolve() if c['CONFIG_FILE'] else c['OUTPUT_DIR'] / CONFIG_FILENAME},
     'COOKIES_FILE': {'default': lambda c: c['COOKIES_FILE'] and Path(c['COOKIES_FILE']).resolve()},
     'CHROME_USER_DATA_DIR': {'default': lambda c: find_chrome_data_dir() if c['CHROME_USER_DATA_DIR'] is None else (Path(c['CHROME_USER_DATA_DIR']).resolve() if c['CHROME_USER_DATA_DIR'] else None)},  # None means unset, so we autodetect it with find_chrome_Data_dir(), but emptystring '' means user manually set it to '', and we should store it as None
-    'URL_BLACKLIST_PTN': {'default': lambda c: c['URL_BLACKLIST'] and re.compile(c['URL_BLACKLIST'] or '', re.IGNORECASE | re.UNICODE | re.MULTILINE)},
-    'URL_WHITELIST_PTN': {'default': lambda c: c['URL_WHITELIST'] and re.compile(c['URL_WHITELIST'] or '', re.IGNORECASE | re.UNICODE | re.MULTILINE)},
+    'URL_DENYLIST_PTN': {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', re.IGNORECASE | re.UNICODE | re.MULTILINE)},
+    'URL_ALLOWLIST_PTN': {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', re.IGNORECASE | re.UNICODE | re.MULTILINE)},
     'DIR_OUTPUT_PERMISSIONS': {'default': lambda c: c['OUTPUT_PERMISSIONS'].replace('6', '7').replace('4', '5')},

     'ARCHIVEBOX_BINARY': {'default': lambda c: sys.argv[0] or bin_path('archivebox')},

diff --git a/archivebox/config_stubs.py b/archivebox/config_stubs.py
index 2c42e808..c8cc9ecb 100644
--- a/archivebox/config_stubs.py
+++ b/archivebox/config_stubs.py
@@ -41,7 +41,7 @@ class ConfigDict(BaseConfig, total=False):
     MEDIA_TIMEOUT: int
     OUTPUT_PERMISSIONS: str
     RESTRICT_FILE_NAMES: str
-    URL_BLACKLIST: str
+    URL_DENYLIST: str

     SECRET_KEY: Optional[str]
     BIND_ADDR: str

diff --git a/archivebox/core/forms.py b/archivebox/core/forms.py
index 99f4d02e..193c0d05 100644
--- a/archivebox/core/forms.py
+++ b/archivebox/core/forms.py
@@ -41,7 +41,7 @@ class AddLinkForm(forms.Form):
     #     label="Exclude patterns",
     #     min_length='1',
     #     required=False,
-    #     initial=URL_BLACKLIST,
+    #     initial=URL_DENYLIST,
     # )
     # timeout = forms.IntegerField(
     #     initial=TIMEOUT,

diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py
index f631430c..b9d57aeb 100644
--- a/archivebox/index/__init__.py
+++ b/archivebox/index/__init__.py
@@ -22,8 +22,8 @@ from ..config import (
     JSON_INDEX_FILENAME,
     OUTPUT_DIR,
     TIMEOUT,
-    URL_BLACKLIST_PTN,
-    URL_WHITELIST_PTN,
+    URL_DENYLIST_PTN,
+    URL_ALLOWLIST_PTN,
     stderr,
     OUTPUT_PERMISSIONS
 )
@@ -142,9 +142,9 @@ def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
             continue
         if scheme(link.url) not in ('http', 'https', 'ftp'):
             continue
-        if URL_BLACKLIST_PTN and URL_BLACKLIST_PTN.search(link.url):
+        if URL_DENYLIST_PTN and URL_DENYLIST_PTN.search(link.url):
             continue
-        if URL_WHITELIST_PTN and (not URL_WHITELIST_PTN.search(link.url)):
+        if URL_ALLOWLIST_PTN and (not URL_ALLOWLIST_PTN.search(link.url)):
             continue
         yield link
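
Note on the 'aliases' key: this patch only declares the aliases in CONFIG_SCHEMA;
the lookup code that consumes them is outside this diff. A minimal sketch of the
intended fallback behavior, assuming the new key is preferred over any legacy
alias (resolve_config_val and its environment-only lookup are illustrative
assumptions, not ArchiveBox's actual config loader):

    import os
    from typing import Optional, Tuple

    def resolve_config_val(key: str, aliases: Tuple[str, ...] = ()) -> Optional[str]:
        """Return the value for key, falling back to legacy alias names."""
        for candidate in (key, *aliases):  # new name wins if both are set
            value = os.environ.get(candidate)
            if value is not None:
                return value
        return None

    # An old config that only sets URL_BLACKLIST keeps working:
    os.environ['URL_BLACKLIST'] = r'\.pdf$'
    assert resolve_config_val('URL_DENYLIST', ('URL_BLACKLIST',)) == r'\.pdf$'

Resolving the new key first keeps behavior deterministic when a config file sets
both the old and the new name.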
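The filtering semantics in archivable_links() are unchanged by the rename: a link
is skipped if it matches the deny pattern, and, when an allowlist is configured,
also skipped unless it matches the allow pattern. A self-contained sketch of that
logic using the default deny regex from this patch (is_archivable is an
illustrative helper, not a function in the codebase):

    import re

    URL_DENYLIST_PTN = re.compile(
        r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$',
        re.IGNORECASE | re.UNICODE | re.MULTILINE,
    )
    URL_ALLOWLIST_PTN = None  # None when URL_ALLOWLIST is unset

    def is_archivable(url: str) -> bool:
        # Deny wins first; then the allowlist must match if one is set.
        if URL_DENYLIST_PTN and URL_DENYLIST_PTN.search(url):
            return False
        if URL_ALLOWLIST_PTN and not URL_ALLOWLIST_PTN.search(url):
            return False
        return True

    assert is_archivable('https://example.com/article')
    assert not is_archivable('https://example.com/static/app.js')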