diff --git a/README.md b/README.md
index 19196b4f..a83922a3 100644
--- a/README.md
+++ b/README.md
@@ -73,7 +73,7 @@ archivebox help
- `archivebox add/remove/update/list` to manage Snapshots in the archive
- `archivebox schedule` to pull in fresh URLs in regularly from [boorkmarks/history/Pocket/Pinboard/RSS/etc.](#input-formats)
- `archivebox oneshot` archive single URLs without starting a whole collection
-- `archivebox shell` open a REPL to use the [Python API](https://docs.archivebox.io/en/latest/modules.html) (alpha)
+- `archivebox shell/manage dbshell` open a REPL to use the [Python API](https://docs.archivebox.io/en/latest/modules.html) (alpha), or SQL API
@@ -639,7 +639,7 @@ archivebox config --set DEBUG=True
archivebox server --debug ...
```
-### Build and run a Github branch
+#### Build and run a Github branch
```bash
docker build -t archivebox:dev https://github.com/ArchiveBox/ArchiveBox.git#dev
@@ -669,6 +669,7 @@ cd archivebox/
cd path/to/test/data/
archivebox shell
+archivebox manage dbshell
```
(uses `pytest -s`)
diff --git a/archivebox/config.py b/archivebox/config.py
index 7fd4b2fc..349817ec 100644
--- a/archivebox/config.py
+++ b/archivebox/config.py
@@ -27,6 +27,7 @@ import re
import sys
import json
import getpass
+import platform
import shutil
import django
@@ -51,7 +52,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'SHELL_CONFIG': {
'IS_TTY': {'type': bool, 'default': lambda _: sys.stdout.isatty()},
'USE_COLOR': {'type': bool, 'default': lambda c: c['IS_TTY']},
- 'SHOW_PROGRESS': {'type': bool, 'default': lambda c: c['IS_TTY']},
+ 'SHOW_PROGRESS': {'type': bool, 'default': lambda c: (c['IS_TTY'] and platform.system() != 'Darwin')}, # progress bars are buggy on mac, disable for now
'IN_DOCKER': {'type': bool, 'default': False},
# TODO: 'SHOW_HINTS': {'type: bool, 'default': True},
},
@@ -914,7 +915,12 @@ os.umask(0o777 - int(OUTPUT_PERMISSIONS, base=8)) # noqa: F821
NODE_BIN_PATH = str((Path(CONFIG["OUTPUT_DIR"]).absolute() / 'node_modules' / '.bin'))
sys.path.append(NODE_BIN_PATH)
-
+# disable stderr "you really shouldn't disable ssl" warnings with library config
+if not CONFIG['CHECK_SSL_VALIDITY']:
+ import urllib3
+ import requests
+ requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
+ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
########################### Config Validity Checkers ###########################
diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py
index c9ba2187..44c08428 100644
--- a/archivebox/core/admin.py
+++ b/archivebox/core/admin.py
@@ -99,13 +99,14 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
list_display = ('added', 'title_str', 'url_str', 'files', 'size')
sort_fields = ('title_str', 'url_str', 'added')
readonly_fields = ('id', 'url', 'timestamp', 'num_outputs', 'is_archived', 'url_hash', 'added', 'updated')
- search_fields = ['url', 'timestamp', 'title', 'tags__name']
+ search_fields = ['url__icontains', 'timestamp', 'title', 'tags__name']
fields = (*readonly_fields, 'title', 'tags')
list_filter = ('added', 'updated', 'tags')
ordering = ['-added']
actions = [delete_snapshots, overwrite_snapshots, update_snapshots, update_titles, verify_snapshots]
actions_template = 'admin/actions_as_select.html'
form = SnapshotAdminForm
+ list_per_page = 40
def get_urls(self):
urls = super().get_urls()
diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py
index bcf9c073..e73c93d9 100644
--- a/archivebox/core/settings.py
+++ b/archivebox/core/settings.py
@@ -33,6 +33,8 @@ LOGOUT_REDIRECT_URL = '/'
PASSWORD_RESET_URL = '/accounts/password_reset/'
APPEND_SLASH = True
+DEBUG = DEBUG or ('--debug' in sys.argv)
+
INSTALLED_APPS = [
'django.contrib.auth',
'django.contrib.contenttypes',
diff --git a/archivebox/extractors/favicon.py b/archivebox/extractors/favicon.py
index 3023382f..67b61360 100644
--- a/archivebox/extractors/favicon.py
+++ b/archivebox/extractors/favicon.py
@@ -47,14 +47,13 @@ def save_favicon(snapshot: Model, out_dir: Optional[Path]=None, timeout: int=TIM
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
'https://www.google.com/s2/favicons?domain={}'.format(domain(snapshot.url)),
]
- status = 'pending'
+ status = 'failed'
timer = TimedProgress(timeout, prefix=' ')
try:
run(cmd, cwd=str(out_dir), timeout=timeout)
chmod_file(output, cwd=str(out_dir))
status = 'succeeded'
except Exception as err:
- status = 'failed'
output = err
finally:
timer.end()
diff --git a/archivebox/extractors/singlefile.py b/archivebox/extractors/singlefile.py
index df76ab5e..943ea979 100644
--- a/archivebox/extractors/singlefile.py
+++ b/archivebox/extractors/singlefile.py
@@ -42,7 +42,7 @@ def save_singlefile(snapshot: Model, out_dir: Optional[Path]=None, timeout: int=
"""download full site using single-file"""
out_dir = out_dir or Path(snapshot.snapshot_dir)
- output = str(out_dir.absolute() / "singlefile.html")
+ output = "singlefile.html"
browser_args = chrome_args(TIMEOUT=0)
@@ -54,6 +54,7 @@ def save_singlefile(snapshot: Model, out_dir: Optional[Path]=None, timeout: int=
browser_args,
snapshot.url,
output
+ output,
]
status = 'succeeded'
@@ -74,9 +75,9 @@ def save_singlefile(snapshot: Model, out_dir: Optional[Path]=None, timeout: int=
)
# Check for common failure cases
- if (result.returncode > 0):
+ if (result.returncode > 0) or not (out_dir / output).is_file():
raise ArchiveError('SingleFile was not able to archive the page', hints)
- chmod_file(output)
+ chmod_file(output, cwd=str(out_dir))
except (Exception, OSError) as err:
status = 'failed'
# TODO: Make this prettier. This is necessary to run the command (escape JSON internal quotes).
diff --git a/archivebox/extractors/title.py b/archivebox/extractors/title.py
index fa082b43..a12a1d54 100644
--- a/archivebox/extractors/title.py
+++ b/archivebox/extractors/title.py
@@ -10,7 +10,6 @@ from django.db.models import Model
from ..index.schema import ArchiveResult, ArchiveOutput, ArchiveError
from ..util import (
enforce_types,
- is_static_file,
download_url,
htmldecode,
)
@@ -65,11 +64,8 @@ class TitleParser(HTMLParser):
# output = '{title}'
@enforce_types
-def should_save_title(snapshot: Model, overwrite: Optional[bool]=False, out_dir: Optional[str]=None) -> bool:
- if is_static_file(snapshot.url):
- False
-
- # if snapshot already has valid title, skip it
+def should_save_title(snapshot: Model, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
+ # if link already has valid title, skip it
if not overwrite and snapshot.title and not snapshot.title.lower().startswith('http'):
return False
@@ -118,7 +114,11 @@ def save_title(snapshot: Model, out_dir: Optional[Path]=None, timeout: int=TIMEO
.update(title=output)
snapshot.title = output
else:
- raise ArchiveError('Unable to detect page title')
+ # if no content was returned, don't save a title (because it might be a temporary error)
+ if not html:
+ raise ArchiveError('Unable to detect page title')
+ # output = html[:128] # use first bit of content as the title
+ output = link.base_url # use the filename as the title (better UX)
except Exception as err:
status = 'failed'
output = err
diff --git a/archivebox/extractors/wget.py b/archivebox/extractors/wget.py
index 6e83c12f..11e767a5 100644
--- a/archivebox/extractors/wget.py
+++ b/archivebox/extractors/wget.py
@@ -12,8 +12,6 @@ from ..index.schema import ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, chmod_file
from ..util import (
enforce_types,
- is_static_file,
- without_scheme,
without_fragment,
without_query,
path,
@@ -107,7 +105,12 @@ def save_wget(snapshot: Model, out_dir: Optional[Path]=None, timeout: int=TIMEOU
if b'ERROR 500: Internal Server Error' in result.stderr:
raise ArchiveError('500 Internal Server Error', hints)
raise ArchiveError('Wget failed or got an error from the server', hints)
- chmod_file(output, cwd=str(out_dir))
+
+ if (out_dir / output).exists():
+ chmod_file(output, cwd=str(out_dir))
+ else:
+ print(f' {out_dir}/{output}')
+ raise ArchiveError('Failed to find wget output after running', hints)
except Exception as err:
status = 'failed'
output = err
@@ -131,8 +134,6 @@ def wget_output_path(snapshot: Model) -> Optional[str]:
See docs on wget --adjust-extension (-E)
"""
- if is_static_file(snapshot.url):
- return without_scheme(without_fragment(snapshot.url))
# Wget downloads can save in a number of different ways depending on the url:
# https://example.com
@@ -184,7 +185,7 @@ def wget_output_path(snapshot: Model) -> Optional[str]:
last_part_of_url = urldecode(full_path.rsplit('/', 1)[-1])
for file_present in search_dir.iterdir():
if file_present == last_part_of_url:
- return str(search_dir / file_present)
+ return str((search_dir / file_present).relative_to(snapshot.snapshot_dir))
# Move up one directory level
search_dir = search_dir.parent
@@ -192,10 +193,15 @@ def wget_output_path(snapshot: Model) -> Optional[str]:
if search_dir == snapshot.snapshot_dir:
break
-
+ # check for literally any file present that isn't an empty folder
+ domain_dir = Path(domain(snapshot.url).replace(":", "+"))
+ files_within = list((Path(snapshot.snapshot_dir) / domain_dir).glob('**/*.*'))
+ if files_within:
+ return str((domain_dir / files_within[-1]).relative_to(snapshot.snapshot_dir))
- search_dir = Path(snapshot.snapshot_dir) / domain(snapshot.url).replace(":", "+") / urldecode(full_path)
- if not search_dir.is_dir():
- return str(search_dir.relative_to(snapshot.snapshot_dir))
+ # fallback to just the domain dir, dont try to introspect further inside it
+ search_dir = Path(snapshot.snapshot_dir) / domain(snapshot.url).replace(":", "+")
+ if search_dir.is_dir():
+ return domain(snapshot.url).replace(":", "+")
return None
diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py
index 8480d03f..4bae1fbf 100644
--- a/archivebox/index/__init__.py
+++ b/archivebox/index/__init__.py
@@ -2,7 +2,6 @@ __package__ = 'archivebox.index'
import os
import shutil
-import json as pyjson
from pathlib import Path
from itertools import chain
@@ -42,6 +41,7 @@ from .html import (
write_html_snapshot_details,
)
from .json import (
+ pyjson,
load_json_snapshot,
write_json_snapshot_details,
)
@@ -320,7 +320,7 @@ def load_snapshot_details(snapshot: Model, out_dir: Optional[str]=None) -> Model
"""
out_dir = out_dir or Path(snapshot.snapshot_dir)
- existing_snapshot = load_json_snapshot_details(Path(out_dir))
+ existing_snapshot = load_json_snapshot(Path(out_dir))
if existing_snapshot:
return merge_snapshots(existing_snapshot, snapshot)
diff --git a/archivebox/index/html.py b/archivebox/index/html.py
index 5fabcf89..39762722 100644
--- a/archivebox/index/html.py
+++ b/archivebox/index/html.py
@@ -24,6 +24,7 @@ from ..config import (
GIT_SHA,
FOOTER_INFO,
HTML_INDEX_FILENAME,
+ SAVE_ARCHIVE_DOT_ORG,
)
MAIN_INDEX_TEMPLATE = 'static_index.html'
@@ -97,11 +98,12 @@ def snapshot_details_template(snapshot: Model) -> str:
or (snapshot.domain if snapshot.is_archived else '')
) or 'about:blank',
'extension': snapshot.extension or 'html',
- 'tags': snapshot.tags_str() or "untagged",
+ 'tags': snapshot.tags_str() or 'untagged',
'size': printable_filesize(snapshot.archive_size) if snapshot.archive_size else 'pending',
'status': 'archived' if snapshot.is_archived else 'not yet archived',
'status_color': 'success' if snapshot.is_archived else 'danger',
'oldest_archive_date': ts_to_date(snapshot.oldest_archive_date),
+ 'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG,
})
@enforce_types
@@ -115,6 +117,8 @@ def render_django_template(template: str, context: Mapping[str, str]) -> str:
def snapshot_icons(snapshot) -> str:
from core.models import EXTRACTORS
+ # start = datetime.now()
+
archive_results = snapshot.archiveresult_set.filter(status="succeeded")
path = snapshot.archive_path
canon = snapshot.canonical_outputs()
@@ -136,33 +140,45 @@ def snapshot_icons(snapshot) -> str:
exclude = ["favicon", "title", "headers", "archive_org"]
# Missing specific entry for WARC
- extractor_items = defaultdict(lambda: None)
+ extractor_outputs = defaultdict(lambda: None)
for extractor, _ in EXTRACTORS:
for result in archive_results:
- if result.extractor == extractor:
- extractor_items[extractor] = result
+ if result.extractor == extractor and result:
+ extractor_outputs[extractor] = result
for extractor, _ in EXTRACTORS:
if extractor not in exclude:
- exists = False
- if extractor_items[extractor] is not None:
- outpath = (Path(path) / canon[f"{extractor}_path"])
- if outpath.is_dir():
- exists = any(outpath.glob('*.*'))
- elif outpath.is_file():
- exists = outpath.stat().st_size > 100
- output += format_html(output_template, path, canon[f"{extractor}_path"], str(exists),
+ existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
+ # Check filesystem to see if anything is actually present (too slow, needs optimization/caching)
+ # if existing:
+ # existing = (Path(path) / existing)
+ # if existing.is_file():
+ # existing = True
+ # elif existing.is_dir():
+ # existing = any(existing.glob('*.*'))
+ output += format_html(output_template, path, canon[f"{extractor}_path"], str(bool(existing)),
extractor, icons.get(extractor, "?"))
if extractor == "wget":
# warc isn't technically it's own extractor, so we have to add it after wget
- exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz"))
- output += format_html(output_template, exists[0] if exists else '#', canon["warc_path"], str(bool(exists)), "warc", icons.get("warc", "?"))
+
+ # get from db (faster but less truthful)
+ exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
+ # get from filesystem (slower but more accurate)
+ # exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz"))
+ output += format_html(output_template, 'warc/', canon["warc_path"], str(bool(exists)), "warc", icons.get("warc", "?"))
if extractor == "archive_org":
# The check for archive_org is different, so it has to be handled separately
- target_path = Path(path) / "archive.org.txt"
- exists = target_path.exists()
+
+ # get from db (faster)
+ exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
+ # get from filesystem (slower)
+ # target_path = Path(path) / "archive.org.txt"
+ # exists = target_path.exists()
output += '
{} '.format(canon["archive_org_path"], str(exists),
"archive_org", icons.get("archive_org", "?"))
- return format_html('
{}', mark_safe(output))
+ result = format_html('{}', mark_safe(output))
+ # end = datetime.now()
+ # print(((end - start).total_seconds()*1000) // 1, 'ms')
+ return result
diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py
index 52bde00a..f243a7a8 100644
--- a/archivebox/index/schema.py
+++ b/archivebox/index/schema.py
@@ -413,6 +413,8 @@ class Link:
"""predict the expected output paths that should be present after archiving"""
from ..extractors.wget import wget_output_path
+ # TODO: banish this awful duplication from the codebase and import these
+ # from their respective extractor files
canonical = {
'index_path': 'index.html',
'favicon_path': 'favicon.ico',
@@ -428,6 +430,7 @@ class Link:
'archive_org_path': 'https://web.archive.org/web/{}'.format(self.base_url),
'git_path': 'git/',
'media_path': 'media/',
+ 'headers_path': 'headers.json',
}
if self.is_static:
# static binary files like PDF and images are handled slightly differently.
diff --git a/archivebox/parsers/generic_txt.py b/archivebox/parsers/generic_txt.py
index 48a8aac7..f2738fd6 100644
--- a/archivebox/parsers/generic_txt.py
+++ b/archivebox/parsers/generic_txt.py
@@ -54,9 +54,9 @@ def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Model]:
# look inside the URL for any sub-urls, e.g. for archive.org links
# https://web.archive.org/web/20200531203453/https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/
# -> https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/
- for url in re.findall(URL_REGEX, line[1:]):
+ for sub_url in re.findall(URL_REGEX, line[1:]):
yield Snapshot(
- url=htmldecode(url),
+ url=htmldecode(sub_url),
timestamp=str(datetime.now().timestamp()),
title=None,
#tags=None,
diff --git a/archivebox/parsers/wallabag_atom.py b/archivebox/parsers/wallabag_atom.py
index c3e6971d..b20b31e7 100644
--- a/archivebox/parsers/wallabag_atom.py
+++ b/archivebox/parsers/wallabag_atom.py
@@ -47,7 +47,7 @@ def parse_wallabag_atom_export(rss_file: IO[str], **_kwargs) -> Iterable[Model]:
time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
try:
tags = str_between(get_row('category'), 'label="', '" />')
- except:
+ except Exception:
tags = None
yield Snapshot(
diff --git a/archivebox/search/utils.py b/archivebox/search/utils.py
index 55c97e75..e6d15455 100644
--- a/archivebox/search/utils.py
+++ b/archivebox/search/utils.py
@@ -34,10 +34,11 @@ def get_indexable_content(results: QuerySet):
return []
# This should come from a plugin interface
+ # TODO: banish this duplication and get these from the extractor file
if method == 'readability':
return get_file_result_content(res, 'content.txt')
elif method == 'singlefile':
- return get_file_result_content(res, '')
+ return get_file_result_content(res,'',use_pwd=True)
elif method == 'dom':
return get_file_result_content(res,'',use_pwd=True)
elif method == 'wget':
diff --git a/archivebox/templates/core/snapshot.html b/archivebox/templates/core/snapshot.html
index 7c513d12..2ffe031d 100644
--- a/archivebox/templates/core/snapshot.html
+++ b/archivebox/templates/core/snapshot.html
@@ -33,7 +33,7 @@
}
.nav > div {
min-height: 30px;
- margin: 8px 0px;
+ line-height: 1.3;
}
.header-top a {
text-decoration: none;
@@ -68,6 +68,11 @@
vertical-align: -2px;
margin-right: 4px;
}
+ .header-toggle {
+ line-height: 14px;
+ font-size: 70px;
+ vertical-align: -8px;
+ }
.info-row {
margin-top: 2px;
@@ -76,24 +81,30 @@
.info-row .alert {
margin-bottom: 0px;
}
- .card {
+ .header-bottom-frames .card {
overflow: hidden;
box-shadow: 2px 3px 14px 0px rgba(0,0,0,0.02);
margin-top: 10px;
+ border: 1px solid rgba(0,0,0,3);
+ border-radius: 14px;
+ background-color: black;
}
.card h4 {
font-size: 1.4vw;
}
.card-body {
- font-size: 1vw;
- padding-top: 1.2vw;
- padding-left: 1vw;
- padding-right: 1vw;
- padding-bottom: 1vw;
+ font-size: 15px;
+ padding: 13px 10px;
+ padding-bottom: 6px;
+ /* padding-left: 3px; */
+ /* padding-right: 3px; */
+ /* padding-bottom: 3px; */
line-height: 1.1;
word-wrap: break-word;
max-height: 102px;
overflow: hidden;
+ background-color: #1a1a1a;
+ color: #d3d3d3;
}
.card-title {
margin-bottom: 4px;
@@ -126,7 +137,7 @@
border-top: 3px solid #aa1e55;
}
.card.selected-card {
- border: 2px solid orange;
+ border: 1px solid orange;
box-shadow: 0px -6px 13px 1px rgba(0,0,0,0.05);
}
.iframe-large {
@@ -174,12 +185,13 @@
width: 98%;
border: 1px solid rgba(0,0,0,0.2);
box-shadow: 4px 4px 4px rgba(0,0,0,0.2);
- margin-top: 5px;
+ margin-top: 0px;
}
.header-bottom-info {
color: #6f6f6f;
- padding-top: 8px;
- padding-bottom: 13px;
+ padding-top: 0px;
+ padding-bottom: 0px;
+ margin: 0px -15px;
}
.header-bottom-info > div {
@@ -203,12 +215,30 @@
margin-top: 5px;
}
.header-bottom-frames .card-title {
- padding-bottom: 0px;
- font-size: 1.2vw;
+ width: 100%;
+ text-align: center;
+ font-size: 18px;
margin-bottom: 5px;
+ display: inline-block;
+ color: #d3d3d3;
+ font-weight: 200;
+ vertical-align: 0px;
+ margin-top: -6px;
}
.header-bottom-frames .card-text {
+ width: 100%;
+ text-align: center;
font-size: 0.9em;
+ display: inline-block;
+ position: relative;
+ top: -11px;
+ }
+ .card-text code {
+ padding: .2rem .4rem;
+ font-size: 90%;
+ color: #bd4147;
+ background-color: #101010;
+ border-radius: .25rem;
}
@media(max-width: 1092px) {
@@ -247,7 +277,7 @@