ArchiveBox/archivebox/parsers/pocket_api.py
mAAdhaTTah ac7ad9e942
Add parser for Pocket API
Pass a url like `pocket://Username` to import that username's archived Pocket
library. Tokens need to be stored in ArchiveBox.conf with the following keys:

```
POCKET_CONSUMER_KEY = key-from-custom-pocket-app
POCKET_ACCESS_TOKENS = {"YourUsername": "pocket-token-for-app"}
```

`POCKET_ACCESS_TOKENS` MUST be on a single line, or the JSON will be
misinterpreted by the parser as a new key/value pair.
2020-12-04 22:54:39 -05:00

115 lines
3.1 KiB
Python

__package__ = 'archivebox.parsers'
import re
from typing import IO, Iterable, Optional
from datetime import datetime
from configparser import ConfigParser
from pathlib import Path
from pocket import Pocket
import requests
from ..index.schema import Link
from ..util import (
enforce_types,
)
from ..config import (
SOURCES_DIR
)
# Number of items requested from the Pocket API per call; also the paging step
# used to compute the request offset.
_COUNT_PER_PAGE = 500
# File inside SOURCES_DIR where the per-username `since` value is persisted
# between imports (read and written with ConfigParser).
_API_DB_PATH = Path(SOURCES_DIR) / 'pocket_api.db'
# search for broken protocols that sometimes come from the Pocket API
# (a leading `http:/` or `https:/` that is NOT followed by a second slash)
_BROKEN_PROTOCOL_RE = re.compile('^(http[s]?)(:/(?!/))')
def get_pocket_articles(api: Pocket, since=None, page=0):
    """Yield archived Pocket articles, fetching one page per API call.

    Starting at ``page``, requests ``_COUNT_PER_PAGE`` items at a time
    (``state='archive'``, oldest first) and keeps requesting the next page
    for as long as a page comes back completely full.

    When a page with fewer than ``_COUNT_PER_PAGE`` items is received, the
    ``since`` value of that response is stored on ``api.last_since`` and
    iteration ends.
    """
    current_page = page
    while True:
        response, _headers = api.get(
            state='archive',
            sort='oldest',
            since=since,
            count=_COUNT_PER_PAGE,
            offset=current_page * _COUNT_PER_PAGE,
        )

        # 'list' is handled both as a dict (iterate its values) and as a
        # plain sequence.
        items = response['list']
        if isinstance(items, dict):
            items = items.values()

        fetched = len(items)
        yield from items

        if fetched != _COUNT_PER_PAGE:
            # Short page: nothing further to fetch, remember the cursor.
            api.last_since = response['since']
            return

        current_page += 1
def link_from_article(article: dict, sources: list):
    """Build a ``Link`` from a single Pocket API article dict.

    The URL is ``resolved_url`` when present and non-empty, otherwise
    ``given_url``.  The title is ``resolved_title``, then ``given_title``,
    then the URL itself.  ``time_read`` becomes the link timestamp and the
    optional ``tags`` entry is passed through unchanged.

    Raises:
        KeyError: if the article has no ``given_url`` to fall back on, or
            no ``time_read``.
    """
    # `.get` so an article that lacks the resolved_* keys still falls back
    # to the given_* values instead of raising KeyError.
    url: str = article.get('resolved_url') or article['given_url']

    # Repair a broken leading scheme ("http:/host" -> "http://host").  The
    # pattern is anchored at the start of the string, so only the scheme is
    # rewritten; a plain str.replace() would also corrupt any well-formed
    # "http://" occurring later in the URL (e.g. inside a query string).
    url = _BROKEN_PROTOCOL_RE.sub(r'\g<1>://', url, count=1)

    title = article.get('resolved_title') or article.get('given_title') or url

    return Link(
        url=url,
        timestamp=article['time_read'],
        title=title,
        tags=article.get('tags'),
        sources=sources,
    )
def write_since(username: str, since: str):
    """Store ``since`` for ``username`` in the Pocket API state file.

    The file at ``_API_DB_PATH`` is created empty if it does not exist yet,
    loaded with ConfigParser, the ``[username]`` section is replaced with a
    single ``since`` option, and the whole file is written back.
    """
    from ..system import atomic_write

    db_exists = _API_DB_PATH.exists()
    if not db_exists:
        atomic_write(_API_DB_PATH, '')

    state = ConfigParser()
    # Keep option names exactly as written (ConfigParser lowercases them
    # by default).
    state.optionxform = str
    state.read(_API_DB_PATH)

    state[username] = {'since': since}

    with open(_API_DB_PATH, 'w+') as db_file:
        state.write(db_file)
def read_since(username: str) -> Optional[str]:
    """Return the stored ``since`` value for ``username``, or ``None``.

    The file at ``_API_DB_PATH`` is created empty if it does not exist yet
    and is then loaded with ConfigParser.  ``None`` is returned when no
    ``since`` value is stored for ``username``.
    """
    from ..system import atomic_write

    db_missing = not _API_DB_PATH.exists()
    if db_missing:
        atomic_write(_API_DB_PATH, '')

    state = ConfigParser()
    # Keep option names exactly as written (ConfigParser lowercases them
    # by default).
    state.optionxform = str
    state.read(_API_DB_PATH)

    stored_since = state.get(username, 'since', fallback=None)
    return stored_since
@enforce_types
def should_parse_as_pocket_api(text: str) -> bool:
    """Return True when ``text`` begins with the ``pocket://`` scheme."""
    pocket_scheme = 'pocket://'
    return text.startswith(pocket_scheme)
@enforce_types
def parse_pocket_api_export(input_buffer: IO[str], **_kwargs) -> Iterable[Link]:
    """Parse bookmarks from the Pocket API

    Every line of ``input_buffer`` that starts with ``pocket://`` is treated
    as ``pocket://<username>``.  For each such line a ``Pocket`` client is
    built from ``POCKET_CONSUMER_KEY`` and the entry for that username in
    ``POCKET_ACCESS_TOKENS``, all articles newer than the stored ``since``
    value are yielded as ``Link`` objects, and the new ``since`` value is
    persisted once the articles for that line have all been consumed.

    Lines that do not start with ``pocket://``, or that carry no username
    after the scheme, are skipped.
    """
    input_buffer.seek(0)

    # Raw string: in a plain literal "\w" and "\/" are invalid escape
    # sequences (DeprecationWarning, SyntaxWarning on Python 3.12+).
    pattern = re.compile(r'^pocket://(\w+)')

    for line in input_buffer:
        if not should_parse_as_pocket_api(line):
            continue

        username_match = pattern.search(line)
        if username_match is None:
            # e.g. a bare "pocket://" with no username: nothing to import,
            # and calling .group() on None would raise AttributeError.
            continue

        # Imported lazily so the Pocket settings are only loaded when a
        # pocket:// line is actually present.
        from ..config import (
            POCKET_CONSUMER_KEY,
            POCKET_ACCESS_TOKENS,
        )

        username = username_match.group(1)
        api = Pocket(POCKET_CONSUMER_KEY, POCKET_ACCESS_TOKENS[username])
        # Filled in by get_pocket_articles() after the last page is fetched.
        api.last_since = None

        for article in get_pocket_articles(api, since=read_since(username)):
            yield link_from_article(article, sources=[line])

        write_since(username, api.last_since)