Add extractors argument (--extract) to the oneshot command and bump version to v0.5.1

This commit is contained in:
Nick Sweeting 2020-12-11 15:48:46 +02:00
parent a194bb6301
commit 9fa70b3452
4 changed files with 15 additions and 6 deletions

View file

@ -89,8 +89,8 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
index_only=command.index_only, index_only=command.index_only,
overwrite=command.overwrite, overwrite=command.overwrite,
init=command.init, init=command.init,
out_dir=pwd or OUTPUT_DIR,
extractors=command.extract, extractors=command.extract,
out_dir=pwd or OUTPUT_DIR,
) )

View file

@ -36,6 +36,13 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
' ~/Desktop/sites_list.csv\n' ' ~/Desktop/sites_list.csv\n'
) )
) )
parser.add_argument(
"--extract",
type=str,
help="Pass a list of the extractors to be used. If the method name is not correct, it will be ignored. \
This does not take precedence over the configuration",
default=""
)
parser.add_argument( parser.add_argument(
'--out-dir', '--out-dir',
type=str, type=str,
@ -55,6 +62,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
oneshot( oneshot(
url=stdin_url or url, url=stdin_url or url,
out_dir=Path(command.out_dir).resolve(), out_dir=Path(command.out_dir).resolve(),
extractors=command.extract,
) )

View file

@ -511,7 +511,7 @@ def status(out_dir: Path=OUTPUT_DIR) -> None:
@enforce_types @enforce_types
def oneshot(url: str, out_dir: Path=OUTPUT_DIR): def oneshot(url: str, extractors: str="", out_dir: Path=OUTPUT_DIR):
""" """
Create a single URL archive folder with an index.json and index.html, and all the archive method outputs. Create a single URL archive folder with an index.json and index.html, and all the archive method outputs.
You can run this to archive single pages without needing to create a whole collection with archivebox init. You can run this to archive single pages without needing to create a whole collection with archivebox init.
@ -523,7 +523,8 @@ def oneshot(url: str, out_dir: Path=OUTPUT_DIR):
color='red' color='red'
) )
raise SystemExit(2) raise SystemExit(2)
methods = ignore_methods(['title'])
methods = extractors.split(",") if extractors else ignore_methods(['title'])
archive_link(oneshot_link[0], out_dir=out_dir, methods=methods) archive_link(oneshot_link[0], out_dir=out_dir, methods=methods)
return oneshot_link return oneshot_link
@ -534,8 +535,8 @@ def add(urls: Union[str, List[str]],
index_only: bool=False, index_only: bool=False,
overwrite: bool=False, overwrite: bool=False,
init: bool=False, init: bool=False,
out_dir: Path=OUTPUT_DIR, extractors: str="",
extractors: str="") -> List[Link]: out_dir: Path=OUTPUT_DIR) -> List[Link]:
"""Add a new URL or list of URLs to your archive""" """Add a new URL or list of URLs to your archive"""
assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)' assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'

View file

@ -1,6 +1,6 @@
{ {
"name": "archivebox", "name": "archivebox",
"version": "0.5.0", "version": "0.5.1",
"description": "ArchiveBox: The self-hosted internet archive", "description": "ArchiveBox: The self-hosted internet archive",
"author": "Nick Sweeting <archivebox-npm@sweeting.me>", "author": "Nick Sweeting <archivebox-npm@sweeting.me>",
"license": "MIT", "license": "MIT",