diff --git a/archivebox/config.py b/archivebox/config.py
index 91871a94..47049342 100644
--- a/archivebox/config.py
+++ b/archivebox/config.py
@@ -159,6 +159,9 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
         'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'},
         'NODE_BINARY': {'type': str, 'default': 'node'},
         'CHROME_BINARY': {'type': str, 'default': None},
+
+        'POCKET_CONSUMER_KEY': {'type': str, 'default': None},
+        'POCKET_ACCESS_TOKENS': {'type': dict, 'default': {}},
     },
 }
 
@@ -386,7 +389,7 @@ def load_config_val(key: str,
             raise ValueError(f'Invalid configuration option {key}={val} (expected an integer)')
         return int(val)
 
-    elif type is list:
+    elif type is list or type is dict:
         return json.loads(val)
 
     raise Exception('Config values can only be str, bool, int or json')
diff --git a/archivebox/parsers/__init__.py b/archivebox/parsers/__init__.py
index 42b2464e..441c08ac 100644
--- a/archivebox/parsers/__init__.py
+++ b/archivebox/parsers/__init__.py
@@ -32,6 +32,7 @@ from ..index.schema import Link
 from ..logging_util import TimedProgress, log_source_saved
 
 from .pocket_html import parse_pocket_html_export
+from .pocket_api import parse_pocket_api_export
 from .pinboard_rss import parse_pinboard_rss_export
 from .wallabag_atom import parse_wallabag_atom_export
 from .shaarli_rss import parse_shaarli_rss_export
@@ -44,6 +45,7 @@ from .generic_txt import parse_generic_txt_export
 
 PARSERS = (
     # Specialized parsers
+    ('Pocket API', parse_pocket_api_export),
     ('Wallabag ATOM', parse_wallabag_atom_export),
     ('Pocket HTML', parse_pocket_html_export),
     ('Pinboard RSS', parse_pinboard_rss_export),
diff --git a/archivebox/parsers/pocket_api.py b/archivebox/parsers/pocket_api.py
new file mode 100644
index 00000000..5327eebb
--- /dev/null
+++ b/archivebox/parsers/pocket_api.py
@@ -0,0 +1,152 @@
+__package__ = 'archivebox.parsers'
+
+
+import re
+
+from typing import IO, Iterable, Optional
+from datetime import datetime
+from configparser import ConfigParser
+
+from pathlib import Path
+from pocket import Pocket
+import requests
+
+from ..index.schema import Link
+from ..util import (
+    enforce_types,
+)
+from ..config import (
+    SOURCES_DIR
+)
+
+_COUNT_PER_PAGE = 500
+_API_DB_PATH = Path(SOURCES_DIR) / 'pocket_api.db'
+
+# search for broken protocols that sometimes come from the Pocket API
+_BROKEN_PROTOCOL_RE = re.compile('^(http[s]?)(:/(?!/))')
+
+# matches a `pocket://<username>` source line and captures the username
+_POCKET_SOURCE_RE = re.compile(r'^pocket://(\w+)')
+
+
+def get_pocket_articles(api: Pocket, since=None, page=0):
+    """Yield articles from the Pocket API, fetching one page per request.
+
+    Requests pages of _COUNT_PER_PAGE items starting at `page` until a page
+    comes back with a different item count, then records that response's
+    'since' value on `api.last_since` for the caller to persist.
+    """
+    while True:
+        body, _headers = api.get(
+            state='archive',
+            sort='oldest',
+            since=since,
+            count=_COUNT_PER_PAGE,
+            offset=page * _COUNT_PER_PAGE,
+        )
+
+        # 'list' may be a dict (iterate its values) or already a plain sequence
+        articles = body['list'].values() if isinstance(body['list'], dict) else body['list']
+        returned_count = len(articles)
+
+        yield from articles
+
+        if returned_count != _COUNT_PER_PAGE:
+            # not a full page, so this was the last one
+            api.last_since = body['since']
+            return
+
+        # loop instead of recursing so generators do not nest one level per page
+        page += 1
+
+
+def link_from_article(article: dict, sources: list):
+    """Build a Link from a single Pocket article dict."""
+    # .get() so an article lacking the 'resolved_*' keys falls back to the
+    # 'given_*' values instead of raising KeyError
+    url: str = article.get('resolved_url') or article['given_url']
+    # the pattern is anchored at the start, so only the leading scheme is
+    # repaired (str.replace would also rewrite any later 'http:/' in the URL)
+    url = _BROKEN_PROTOCOL_RE.sub(r'\1://', url)
+    title = article.get('resolved_title') or article.get('given_title') or url
+
+    return Link(
+        url=url,
+        timestamp=article['time_read'],
+        title=title,
+        tags=article.get('tags'),
+        sources=sources
+    )
+
+
+def write_since(username: str, since: str):
+    """Store `since` under the `username` section of the file at _API_DB_PATH."""
+    from ..system import atomic_write
+
+    if not _API_DB_PATH.exists():
+        atomic_write(_API_DB_PATH, '')
+
+    since_file = ConfigParser()
+    # keep option names case-sensitive (ConfigParser lowercases them by default)
+    since_file.optionxform = str
+    since_file.read(_API_DB_PATH)
+
+    since_file[username] = {
+        'since': since
+    }
+
+    with open(_API_DB_PATH, 'w+') as new:
+        since_file.write(new)
+
+
+def read_since(username: str) -> Optional[str]:
+    """Return the 'since' value stored for `username`, or None if there is none."""
+    from ..system import atomic_write
+
+    if not _API_DB_PATH.exists():
+        atomic_write(_API_DB_PATH, '')
+
+    config_file = ConfigParser()
+    config_file.optionxform = str
+    config_file.read(_API_DB_PATH)
+
+    return config_file.get(username, 'since', fallback=None)
+
+
+@enforce_types
+def should_parse_as_pocket_api(text: str) -> bool:
+    """Return True if `text` starts with the 'pocket://' prefix."""
+    return text.startswith('pocket://')
+
+
+@enforce_types
+def parse_pocket_api_export(input_buffer: IO[str], **_kwargs) -> Iterable[Link]:
+    """Parse bookmarks from the Pocket API"""
+
+    input_buffer.seek(0)
+    for line in input_buffer:
+        if not should_parse_as_pocket_api(line):
+            continue
+
+        from ..config import (
+            POCKET_CONSUMER_KEY,
+            POCKET_ACCESS_TOKENS,
+        )
+
+        match = _POCKET_SOURCE_RE.search(line)
+        if match is None:
+            raise ValueError(f'Expected pocket://<username> but got: {line.strip()!r}')
+        username = match.group(1)
+
+        try:
+            access_token = POCKET_ACCESS_TOKENS[username]
+        except KeyError:
+            raise KeyError(f'No entry for {username!r} in POCKET_ACCESS_TOKENS') from None
+
+        api = Pocket(POCKET_CONSUMER_KEY, access_token)
+        api.last_since = None
+
+        for article in get_pocket_articles(api, since=read_since(username)):
+            yield link_from_article(article, sources=[line])
+
+        write_since(username, api.last_since)
diff --git a/setup.py b/setup.py
index f65ead27..f81edf21 100755
--- a/setup.py
+++ b/setup.py
@@ -59,6 +59,7 @@ setuptools.setup(
         "python-crontab==2.5.1",
         "croniter==0.3.34",
         "w3lib==1.22.0",
+        "pocket==0.3.6",
         # Some/all of these will likely be added in the future:
         # wpull
         # pywb