From f78838ef40d6a0c2779de200ed22b1e9ed0bb630 Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Sun, 10 Jun 2018 18:45:41 -0400
Subject: [PATCH] add DOM archiving with chrome headless

---
 archive_methods.py       | 44 ++++++++++++++++++++++++++++++++++++++--
 config.py                |  1 +
 links.py                 |  3 +++
 templates/index.html     |  1 +
 templates/index_row.html | 13 ++++++------
 util.py                  |  7 +++++--
 6 files changed, 59 insertions(+), 10 deletions(-)
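Reviewer note (this sits above the first diff header, so it is ignored when the patch is applied): the new fetch_dom() step boils down to piping chrome's --dump-dom output into an output.html file inside the link's archive folder. A minimal standalone sketch of that call, assuming a chromium-browser binary on PATH, a placeholder URL, and an illustrative 60s timeout (none of these are set by this patch):

    # Sketch only, not part of the applied diff. Approximates the subprocess call that
    # fetch_dom() in archive_methods.py builds via chrome_headless() + '--dump-dom'.
    from subprocess import run, PIPE

    CMD = ['chromium-browser', '--headless', '--dump-dom', 'https://example.com']
    with open('output.html', 'w+') as f:                    # serialized DOM lands in output.html
        result = run(CMD, stdout=f, stderr=PIPE, timeout=60)
    if result.returncode:
        print(result.stderr.decode())                       # surface chrome's error output on failure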
diff --git a/archive_methods.py b/archive_methods.py
index c5e77660..cb4e19da 100644
--- a/archive_methods.py
+++ b/archive_methods.py
@@ -17,6 +17,7 @@ from config import (
     FETCH_WGET_REQUISITES,
     FETCH_PDF,
     FETCH_SCREENSHOT,
+    FETCH_DOM,
     RESOLUTION,
     CHECK_SSL_VALIDITY,
     SUBMIT_ARCHIVE_DOT_ORG,
@@ -93,6 +94,9 @@ def archive_link(link_dir, link, overwrite=True):
     if FETCH_SCREENSHOT:
         link = fetch_screenshot(link_dir, link, overwrite=overwrite)
 
+    if FETCH_DOM:
+        link = fetch_dom(link_dir, link, overwrite=overwrite)
+
     if SUBMIT_ARCHIVE_DOT_ORG:
         link = archive_dot_org(link_dir, link, overwrite=overwrite)
 
@@ -252,7 +256,6 @@ def fetch_pdf(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DI
         'output': output,
     }
 
-
 @attach_result_to_link('screenshot')
 def fetch_screenshot(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR, resolution=RESOLUTION):
     """take screenshot of site using chrome --headless"""
@@ -289,6 +292,43 @@ def fetch_screenshot(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_
         'output': output,
     }
 
+@attach_result_to_link('dom')
+def fetch_dom(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR):
+    """print HTML of site to file using chrome --dump-dom"""
+
+    if link['type'] in ('PDF', 'image'):
+        return {'output': wget_output_path(link)}
+
+    output_path = os.path.join(link_dir, 'output.html')
+
+    if os.path.exists(output_path):
+        return {'output': 'output.html', 'status': 'skipped'}
+
+    CMD = [
+        *chrome_headless(user_data_dir=user_data_dir),
+        '--dump-dom',
+        link['url']
+    ]
+    end = progress(timeout, prefix=' ')
+    try:
+        with open(output_path, 'w+') as f:
+            result = run(CMD, stdout=f, stderr=PIPE, cwd=link_dir, timeout=timeout + 1)  # output.html
+        end()
+        if result.returncode:
+            print(' ', (result.stderr).decode())
+            raise Exception('Failed to fetch DOM')
+        chmod_file('output.html', cwd=link_dir)
+        output = 'output.html'
+    except Exception as e:
+        end()
+        print(' Run to see full output:', 'cd {}; {}'.format(link_dir, ' '.join(CMD)))
+        print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
+        output = e
+
+    return {
+        'cmd': CMD,
+        'output': output,
+    }
 
 @attach_result_to_link('archive_org')
 def archive_dot_org(link_dir, link, timeout=TIMEOUT):
@@ -445,7 +485,7 @@ def fetch_favicon(link_dir, link, timeout=TIMEOUT):
 
 
 def chrome_headless(binary=CHROME_BINARY, user_data_dir=CHROME_USER_DATA_DIR):
-    args = [binary, '--headless', '--disable-gpu']
+    args = [binary, '--headless']  # '--disable-gpu'
     default_profile = os.path.expanduser('~/Library/Application Support/Google/Chrome/Default')
     if user_data_dir:
         args.append('--user-data-dir={}'.format(user_data_dir))
diff --git a/config.py b/config.py
index 6f85ed09..ceae6c75 100644
--- a/config.py
+++ b/config.py
@@ -19,6 +19,7 @@ FETCH_AUDIO              = os.getenv('FETCH_AUDIO',              'False'
 FETCH_VIDEO              = os.getenv('FETCH_VIDEO',              'False'             ).lower() == 'true'
 FETCH_PDF                = os.getenv('FETCH_PDF',                'True'              ).lower() == 'true'
 FETCH_SCREENSHOT         = os.getenv('FETCH_SCREENSHOT',         'True'              ).lower() == 'true'
+FETCH_DOM                = os.getenv('FETCH_DOM',                'True'              ).lower() == 'true'
 FETCH_FAVICON            = os.getenv('FETCH_FAVICON',            'True'              ).lower() == 'true'
 SUBMIT_ARCHIVE_DOT_ORG   = os.getenv('SUBMIT_ARCHIVE_DOT_ORG',   'True'              ).lower() == 'true'
 RESOLUTION               = os.getenv('RESOLUTION',               '1440,1200'         )
diff --git a/links.py b/links.py
index 61d968e9..b3fca5d4 100644
--- a/links.py
+++ b/links.py
@@ -68,6 +68,9 @@ def validate_links(links):
         if not link['latest'].get('screenshot'):
             link['latest']['screenshot'] = None
 
+        if not link['latest'].get('dom'):
+            link['latest']['dom'] = None
+
     return list(links)
 
 
diff --git a/templates/index.html b/templates/index.html
index e3037495..22cb888b 100644
--- a/templates/index.html
+++ b/templates/index.html
@@ -113,6 +113,7 @@
 Status
 Saved Articles ($num_links)
 Index
+HTML
 PDF
 Screenshot
 A.org
diff --git a/templates/index_row.html b/templates/index_row.html
index 1f5ca069..d3de91cc 100644
--- a/templates/index_row.html
+++ b/templates/index_row.html
@@ -5,12 +5,13 @@
-
+
 $title
 $tags
-📂
-📄
-🖼
-🏛
-🔗 $url
+📂
+📄
+📜
+🖼
+🏛
+$url
diff --git a/util.py b/util.py
index 4b1a6b29..d18f2159 100644
--- a/util.py
+++ b/util.py
@@ -23,6 +23,7 @@ from config import (
     FETCH_WGET,
     FETCH_PDF,
     FETCH_SCREENSHOT,
+    FETCH_DOM,
     FETCH_FAVICON,
     FETCH_AUDIO,
     FETCH_VIDEO,
@@ -49,7 +50,7 @@ def check_dependencies():
         print(' See https://github.com/pirate/bookmark-archiver#troubleshooting for help upgrading your Python installation.')
         raise SystemExit(1)
 
-    if FETCH_PDF or FETCH_SCREENSHOT:
+    if FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM:
         if run(['which', CHROME_BINARY], stdout=DEVNULL).returncode:
             print('{}[X] Missing dependency: {}{}'.format(ANSI['red'], CHROME_BINARY, ANSI['reset']))
             print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format(CHROME_BINARY))
@@ -64,7 +65,7 @@ def check_dependencies():
             version = [l for l in version_lines if l.isdigit()][-1]
             if int(version) < 59:
                 print(version_lines)
-                print('{red}[X] Chrome version must be 59 or greater for headless PDF and screenshot saving{reset}'.format(**ANSI))
+                print('{red}[X] Chrome version must be 59 or greater for headless PDF, screenshot, and DOM saving{reset}'.format(**ANSI))
                 print(' See https://github.com/pirate/bookmark-archiver for help.')
                 raise SystemExit(1)
         except (IndexError, TypeError, OSError):
@@ -459,6 +460,7 @@ def derived_link_info(link):
         'archive_url': 'archive/{}/{}'.format(link['timestamp'], wget_output_path(link)),
         'pdf_link': 'archive/{timestamp}/output.pdf'.format(**link),
         'screenshot_link': 'archive/{timestamp}/screenshot.png'.format(**link),
+        'dom_link': 'archive/{timestamp}/output.html'.format(**link),
         'archive_org_url': 'https://web.archive.org/web/{base_url}'.format(**link),
     }
 
@@ -469,6 +471,7 @@ def derived_link_info(link):
             'archive_url': 'archive/{timestamp}/{base_url}'.format(**link),
             'pdf_link': 'archive/{timestamp}/{base_url}'.format(**link),
             'screenshot_link': 'archive/{timestamp}/{base_url}'.format(**link),
+            'dom_link': 'archive/{timestamp}/{base_url}'.format(**link),
             'title': '{title} ({type})'.format(**link),
         })
     return link_info
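Reviewer note: like the other FETCH_* switches, the new step is toggled by an environment variable read in config.py, and its output is what the index links to as dom_link ('archive/{timestamp}/output.html'). A minimal sketch of that flag parsing, mirroring the hunk above (sketch only, not part of the diff):

    # Mirrors the FETCH_DOM line this patch adds to config.py.
    # Only a value of 'true' (any casing) enables the step; the default is 'True',
    # so DOM dumping stays on unless FETCH_DOM is explicitly set to something else.
    import os

    FETCH_DOM = os.getenv('FETCH_DOM', 'True').lower() == 'true'
    print('DOM archiving enabled:', FETCH_DOM)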