Show full version info using the --version flag

This commit is contained in:
Nick Sweeting 2019-03-27 16:44:00 -04:00
parent 3375522ff4
commit a26c2fe467
4 changed files with 114 additions and 94 deletions

View file

@ -12,7 +12,7 @@ __package__ = 'archivebox'
import os import os
import sys import sys
import shutil
from typing import List, Optional from typing import List, Optional
@ -23,8 +23,23 @@ from .archive_methods import archive_link
from .config import ( from .config import (
ONLY_NEW, ONLY_NEW,
OUTPUT_DIR, OUTPUT_DIR,
PYTHON_DIR,
VERSION, VERSION,
ANSI,
CURL_VERSION,
GIT_VERSION,
WGET_VERSION,
YOUTUBEDL_VERSION,
CHROME_VERSION,
USE_CURL,
USE_WGET,
USE_CHROME,
CURL_BINARY,
GIT_BINARY,
WGET_BINARY,
YOUTUBEDL_BINARY,
CHROME_BINARY,
FETCH_GIT,
FETCH_MEDIA,
) )
from .util import ( from .util import (
enforce_types, enforce_types,
@ -59,8 +74,37 @@ def print_help():
print(" archivebox add --depth=1 https://example.com/feed.rss") print(" archivebox add --depth=1 https://example.com/feed.rss")
print(" archivebox update --resume=15109948213.123") print(" archivebox update --resume=15109948213.123")
def print_version():
    """Print the ArchiveBox version, then one status entry per dependency.

    Each entry shows an enabled/disabled marker (``X`` marks a disabled
    dependency), the command that reports the dependency's version, and the
    version value imported from the config module.
    """
    print('ArchiveBox v{}'.format(__VERSION__))
    print()

    # (label, enabled flag, configured binary, version value)
    dependencies = (
        ('CURL', USE_CURL, CURL_BINARY, CURL_VERSION),
        ('GIT', FETCH_GIT, GIT_BINARY, GIT_VERSION),
        ('WGET', USE_WGET, WGET_BINARY, WGET_VERSION),
        ('YOUTUBEDL', FETCH_MEDIA, YOUTUBEDL_BINARY, YOUTUBEDL_VERSION),
        ('CHROME', USE_CHROME, CHROME_BINARY, CHROME_VERSION),
    )

    for label, enabled, binary, version in dependencies:
        # shutil.which() returns None for a binary that is not on PATH (and
        # rejects a None argument); fall back to the configured name so the
        # line never reads "None --version".
        resolved = (shutil.which(binary) if binary else None) or binary
        # NOTE(review): the "enabled" marker is an empty string here; it may
        # have been a check-mark character originally -- confirm.
        print(
            '[{}] {}:'.format('' if enabled else 'X', label).ljust(14),
            '{} --version\n'.format(resolved),
            ' '*13, version, '\n',
        )
def main(args=None) -> List[Link]:
def main(args=None) -> None:
if args is None: if args is None:
args = sys.argv args = sys.argv
@ -69,7 +113,7 @@ def main(args=None) -> List[Link]:
raise SystemExit(0) raise SystemExit(0)
if set(args).intersection(('--version', 'version')): if set(args).intersection(('--version', 'version')):
print('ArchiveBox version {}'.format(__VERSION__)) print_version()
raise SystemExit(0) raise SystemExit(0)
### Handle CLI arguments ### Handle CLI arguments
@ -86,7 +130,19 @@ def main(args=None) -> List[Link]:
### Set up output folder ### Set up output folder
if not os.path.exists(OUTPUT_DIR): if not os.path.exists(OUTPUT_DIR):
print('{green}[+] Created a new archive directory: {}{reset}'.format(OUTPUT_DIR, **ANSI))
os.makedirs(OUTPUT_DIR) os.makedirs(OUTPUT_DIR)
else:
not_empty = len(set(os.listdir(OUTPUT_DIR)) - {'.DS_Store'})
index_exists = os.path.exists(os.path.join(OUTPUT_DIR, 'index.json'))
if not_empty and not index_exists:
print(
('{red}[X] Could not find index.json in the OUTPUT_DIR: {reset}{}\n'
' You must run ArchiveBox in an existing archive directory, \n'
' or an empty/new directory to start a new archive collection.'
).format(OUTPUT_DIR, **ANSI)
)
raise SystemExit(1)
### Handle ingesting urls piped in through stdin ### Handle ingesting urls piped in through stdin
# (.e.g if user does cat example_urls.txt | ./archive) # (.e.g if user does cat example_urls.txt | ./archive)

View file

@ -4,7 +4,7 @@ from typing import Dict, List, Tuple
from collections import defaultdict from collections import defaultdict
from datetime import datetime from datetime import datetime
from .schema import Link, ArchiveResult, ArchiveError from .schema import Link, ArchiveResult
from .index import ( from .index import (
write_link_index, write_link_index,
patch_links_index, patch_links_index,
@ -28,8 +28,6 @@ from .config import (
SUBMIT_ARCHIVE_DOT_ORG, SUBMIT_ARCHIVE_DOT_ORG,
TIMEOUT, TIMEOUT,
MEDIA_TIMEOUT, MEDIA_TIMEOUT,
ANSI,
OUTPUT_DIR,
GIT_DOMAINS, GIT_DOMAINS,
VERSION, VERSION,
WGET_USER_AGENT, WGET_USER_AGENT,
@ -40,7 +38,6 @@ from .config import (
CHROME_VERSION, CHROME_VERSION,
GIT_VERSION, GIT_VERSION,
YOUTUBEDL_VERSION, YOUTUBEDL_VERSION,
ONLY_NEW,
WGET_AUTO_COMPRESSION, WGET_AUTO_COMPRESSION,
) )
from .util import ( from .util import (
@ -56,7 +53,6 @@ from .util import (
wget_output_path, wget_output_path,
chrome_args, chrome_args,
run, PIPE, DEVNULL, run, PIPE, DEVNULL,
Link,
) )
from .logs import ( from .logs import (
log_link_archiving_started, log_link_archiving_started,
@ -66,6 +62,12 @@ from .logs import (
) )
class ArchiveError(Exception):
    """Exception that carries a message plus an optional ``hints`` value.

    ``message`` is forwarded to ``Exception``; ``hints`` (default ``None``)
    is stored unchanged on the instance as ``self.hints``.
    """

    def __init__(self, message, hints=None):
        self.hints = hints
        super(ArchiveError, self).__init__(message)
@enforce_types @enforce_types
def archive_link(link: Link, page=None) -> Link: def archive_link(link: Link, page=None) -> Link:
"""download the DOM, PDF, and a screenshot into a folder named after the link's timestamp""" """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""

View file

@ -59,8 +59,24 @@ CHROME_BINARY = os.getenv('CHROME_BINARY', None)
CHROME_SANDBOX = os.getenv('CHROME_SANDBOX', 'True').lower() == 'true' CHROME_SANDBOX = os.getenv('CHROME_SANDBOX', 'True').lower() == 'true'
# ****************************************************************************** # ******************************************************************************
# *************************** Directory Settings *******************************
# ****************************************************************************** ### Terminal Configuration
TERM_WIDTH = lambda: shutil.get_terminal_size((100, 10)).columns
ANSI = {
'reset': '\033[00;00m',
'lightblue': '\033[01;30m',
'lightyellow': '\033[01;33m',
'lightred': '\033[01;35m',
'red': '\033[01;31m',
'green': '\033[01;32m',
'blue': '\033[01;34m',
'white': '\033[01;37m',
'black': '\033[01;30m',
}
if not USE_COLOR:
# dont show colors if USE_COLOR is False
ANSI = {k: '' for k in ANSI.keys()}
REPO_DIR = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..')) REPO_DIR = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))
if OUTPUT_DIR: if OUTPUT_DIR:
@ -68,21 +84,6 @@ if OUTPUT_DIR:
else: else:
OUTPUT_DIR = os.path.abspath(os.curdir) OUTPUT_DIR = os.path.abspath(os.curdir)
if not os.path.exists(OUTPUT_DIR):
print('{green}[+] Created a new archive directory: {}{reset}'.format(OUTPUT_DIR, **ANSI))
os.makedirs(OUTPUT_DIR)
else:
not_empty = len(set(os.listdir(OUTPUT_DIR)) - {'.DS_Store'})
index_exists = os.path.exists(os.path.join(OUTPUT_DIR, 'index.json'))
if not_empty and not index_exists:
print(
('{red}[X] Could not find index.json in the OUTPUT_DIR: {reset}{}\n'
' You must run ArchiveBox in an existing archive directory, \n'
' or an empty/new directory to start a new archive collection.'
).format(OUTPUT_DIR, **ANSI)
)
raise SystemExit(1)
ARCHIVE_DIR_NAME = 'archive' ARCHIVE_DIR_NAME = 'archive'
SOURCES_DIR_NAME = 'sources' SOURCES_DIR_NAME = 'sources'
ARCHIVE_DIR = os.path.join(OUTPUT_DIR, ARCHIVE_DIR_NAME) ARCHIVE_DIR = os.path.join(OUTPUT_DIR, ARCHIVE_DIR_NAME)
@ -94,13 +95,34 @@ TEMPLATES_DIR = os.path.join(PYTHON_DIR, 'templates')
if COOKIES_FILE: if COOKIES_FILE:
COOKIES_FILE = os.path.abspath(COOKIES_FILE) COOKIES_FILE = os.path.abspath(COOKIES_FILE)
VERSION = open(os.path.join(PYTHON_DIR, 'VERSION'), 'r').read().strip()
GIT_SHA = VERSION.split('+')[1]
### Check Python environment
# python_vers is kept as a float for display only. It must not be used for the
# comparison: float('3.10') == 3.1, which would wrongly reject Python 3.10+
# as being older than 3.5. Compare the version_info tuple instead.
python_vers = float('{}.{}'.format(sys.version_info.major, sys.version_info.minor))
if sys.version_info < (3, 5):
    print('{}[X] Python version is not new enough: {} (>=3.5 is required){}'.format(ANSI['red'], python_vers, ANSI['reset']))
    print(' See https://github.com/pirate/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.')
    raise SystemExit(1)
if sys.stdout.encoding.upper() not in ('UTF-8', 'UTF8'):
print('[X] Your system is running python3 scripts with a bad locale setting: {} (it should be UTF-8).'.format(sys.stdout.encoding))
print(' To fix it, add the line "export PYTHONIOENCODING=UTF-8" to your ~/.bashrc file (without quotes)')
print('')
print(' Confirm that it\'s fixed by opening a new shell and running:')
print(' python3 -c "import sys; print(sys.stdout.encoding)" # should output UTF-8')
print('')
print(' Alternatively, run this script with:')
print(' env PYTHONIOENCODING=UTF-8 ./archive.py export.html')
# ****************************************************************************** # ******************************************************************************
# ***************************** Helper Functions ******************************* # ***************************** Helper Functions *******************************
# ****************************************************************************** # ******************************************************************************
def check_version(binary: str) -> str: def check_version(binary: str) -> str:
"""check the presence and return valid version line of a specified binary""" """check the presence and return valid version line of a specified binary"""
if run(['which', binary], stdout=DEVNULL, stderr=DEVNULL).returncode: if not shutil.which(binary):
print('{red}[X] Missing dependency: wget{reset}'.format(**ANSI)) print('{red}[X] Missing dependency: wget{reset}'.format(**ANSI))
print(' Install it, then confirm it works with: {} --version'.format(binary)) print(' Install it, then confirm it works with: {} --version'.format(binary))
print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.') print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
@ -168,43 +190,6 @@ def find_chrome_data_dir() -> Optional[str]:
# ****************************************************************************** # ******************************************************************************
try: try:
VERSION = open(os.path.join(PYTHON_DIR, 'VERSION'), 'r').read().strip()
GIT_SHA = VERSION.split('+')[1]
### Terminal Configuration
TERM_WIDTH = lambda: shutil.get_terminal_size((100, 10)).columns
ANSI = {
'reset': '\033[00;00m',
'lightblue': '\033[01;30m',
'lightyellow': '\033[01;33m',
'lightred': '\033[01;35m',
'red': '\033[01;31m',
'green': '\033[01;32m',
'blue': '\033[01;34m',
'white': '\033[01;37m',
'black': '\033[01;30m',
}
if not USE_COLOR:
# dont show colors if USE_COLOR is False
ANSI = {k: '' for k in ANSI.keys()}
### Check Python environment
python_vers = float('{}.{}'.format(sys.version_info.major, sys.version_info.minor))
if python_vers < 3.5:
print('{}[X] Python version is not new enough: {} (>3.5 is required){}'.format(ANSI['red'], python_vers, ANSI['reset']))
print(' See https://github.com/pirate/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.')
raise SystemExit(1)
if sys.stdout.encoding.upper() not in ('UTF-8', 'UTF8'):
print('[X] Your system is running python3 scripts with a bad locale setting: {} (it should be UTF-8).'.format(sys.stdout.encoding))
print(' To fix it, add the line "export PYTHONIOENCODING=UTF-8" to your ~/.bashrc file (without quotes)')
print('')
print(' Confirm that it\'s fixed by opening a new shell and running:')
print(' python3 -c "import sys; print(sys.stdout.encoding)" # should output UTF-8')
print('')
print(' Alternatively, run this script with:')
print(' env PYTHONIOENCODING=UTF-8 ./archive.py export.html')
### Make sure curl is installed ### Make sure curl is installed
if USE_CURL: if USE_CURL:
USE_CURL = FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG USE_CURL = FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG
@ -238,17 +223,18 @@ try:
### Make sure youtube-dl is installed ### Make sure youtube-dl is installed
YOUTUBEDL_VERSION = None YOUTUBEDL_VERSION = None
if FETCH_MEDIA: if FETCH_MEDIA:
check_version(YOUTUBEDL_BINARY) YOUTUBEDL_VERSION = check_version(YOUTUBEDL_BINARY)
### Make sure chrome is installed and calculate version ### Make sure chrome is installed and calculate version
if USE_CHROME: if USE_CHROME:
USE_CHROME = FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM USE_CHROME = FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM
else: else:
FETCH_PDF = FETCH_SCREENSHOT = FETCH_DOM = False FETCH_PDF = FETCH_SCREENSHOT = FETCH_DOM = False
if CHROME_BINARY is None:
CHROME_BINARY = find_chrome_binary() or 'chromium-browser'
CHROME_VERSION = None CHROME_VERSION = None
if USE_CHROME: if USE_CHROME:
if CHROME_BINARY is None:
CHROME_BINARY = find_chrome_binary()
if CHROME_BINARY: if CHROME_BINARY:
CHROME_VERSION = check_version(CHROME_BINARY) CHROME_VERSION = check_version(CHROME_BINARY)
# print('[i] Using Chrome binary: {}'.format(shutil.which(CHROME_BINARY) or CHROME_BINARY)) # print('[i] Using Chrome binary: {}'.format(shutil.which(CHROME_BINARY) or CHROME_BINARY))

View file

@ -1,24 +1,3 @@
"""
In ArchiveBox, a Link represents a single entry that we track in the
json index. All links pass through all archiver functions and the latest,
most up-to-date canonical output for each is stored in "latest".
Link {
timestamp: str, (how we uniquely id links)
url: str,
title: str,
tags: str,
sources: [str],
history: {
pdf: [
{start_ts, end_ts, cmd, pwd, cmd_version, status, output},
...
],
...
},
}
"""
from typing import Iterable from typing import Iterable
from collections import OrderedDict from collections import OrderedDict
@ -27,8 +6,6 @@ from .util import (
scheme, scheme,
fuzzy_url, fuzzy_url,
merge_links, merge_links,
htmldecode,
hashurl,
) )
@ -68,10 +45,9 @@ def uniquefied_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
unique_timestamps: OrderedDict[str, Link] = OrderedDict() unique_timestamps: OrderedDict[str, Link] = OrderedDict()
for link in unique_urls.values(): for link in unique_urls.values():
new_link = Link(**{ new_link = link.overwrite(
**link._asdict(), timestamp=lowest_uniq_timestamp(unique_timestamps, link.timestamp),
'timestamp': lowest_uniq_timestamp(unique_timestamps, link.timestamp), )
})
unique_timestamps[new_link.timestamp] = new_link unique_timestamps[new_link.timestamp] = new_link
return unique_timestamps.values() return unique_timestamps.values()