add HTTP byte range request support to media file serving

This commit is contained in:
Nick Sweeting 2024-09-05 21:41:49 -07:00
parent ba6c1fd69b
commit c76c50e71f
No known key found for this signature in database
4 changed files with 181 additions and 8 deletions

View file

@ -68,7 +68,7 @@ def wait_for_bg_threads_to_exit(thread_names: Iterable[str]=(), ignore_names: It
else:
return tries
raise Exception('Background threads failed to exit after {tries}s: {threads_summary}')
raise Exception(f'Background threads failed to exit after {tries}s: {threads_summary}')
def list_subcommands() -> Dict[str, str]:

View file

@ -0,0 +1,169 @@
import os
import stat
import posixpath
import mimetypes
from pathlib import Path
from django.contrib.staticfiles import finders
from django.views import static
from django.http import StreamingHttpResponse, Http404, HttpResponse, HttpResponseNotModified
from django.utils._os import safe_join
from django.utils.http import http_date
from django.utils.translation import gettext as _
def serve_static_with_byterange_support(request, path, document_root=None, show_indexes=False):
    """
    Serve a single file from below ``document_root``, honouring HTTP byte-range requests.

    Overrides Django's built-in django.views.static.serve function to support byte range requests.
    This allows you to do things like seek into the middle of a huge mp4 or WACZ without downloading the whole file.
    https://github.com/satchamo/django/commit/2ce75c5c4bee2a858c0214d136bfcd351fcde11d

    Args:
        request: the incoming request; ``HTTP_IF_MODIFIED_SINCE`` and
            ``HTTP_RANGE`` are read from ``request.META``.
        path: path of the requested file, relative to ``document_root``.
        document_root: directory the file must live under (required).
        show_indexes: when true, a request for a directory returns Django's
            directory index instead of a 404.

    Returns:
        - 304 if the file was not modified since ``If-Modified-Since``
        - 206 with a ``Content-Range`` header for a single satisfiable byte range
        - 416 if the single requested range starts at or beyond the end of the file
        - 200 with the whole file otherwise (no Range header, an invalid
          Range header, or a multi-range request)

    Raises:
        Http404: if the path is a directory and ``show_indexes`` is false, or
            if the file does not exist.
    """
    assert document_root
    path = posixpath.normpath(path).lstrip("/")
    fullpath = Path(safe_join(document_root, path))
    if fullpath.is_dir():
        if show_indexes:
            return static.directory_index(path, fullpath)
        raise Http404(_("Directory indexes are not allowed here."))
    if not fullpath.exists():
        raise Http404(_("“%(path)s” does not exist") % {"path": fullpath})

    # Respect the If-Modified-Since header.
    statobj = fullpath.stat()
    if not static.was_modified_since(request.META.get("HTTP_IF_MODIFIED_SINCE"), statobj.st_mtime):
        return HttpResponseNotModified()

    content_type, encoding = mimetypes.guess_type(str(fullpath))
    content_type = content_type or "application/octet-stream"

    is_regular_file = stat.S_ISREG(statobj.st_mode)
    size = statobj.st_size

    # Work out the requested byte range *before* opening the file, so that the
    # early 416 return below cannot leak an open file handle.
    byte_range = None
    if is_regular_file and "HTTP_RANGE" in request.META:
        try:
            ranges = parse_range_header(request.META["HTTP_RANGE"], size)
        except ValueError:
            ranges = None
        # only handle syntactically valid headers, that are simple (no
        # multipart byteranges)
        if ranges is not None and len(ranges) == 1:
            start, stop = ranges[0]
            if start >= size:
                # requested range not satisfiable: it starts past the end of the file
                unsatisfiable = HttpResponse(status=416)
                unsatisfiable["Content-Range"] = "bytes */%d" % size
                return unsatisfiable
            # a last-byte-pos beyond the end of the file means "up to the end",
            # it is not an error
            byte_range = (start, min(stop, size))

    # set up the response object
    ranged_file = RangedFileReader(open(fullpath, "rb"))
    response = StreamingHttpResponse(ranged_file, content_type=content_type)
    response.headers["Last-Modified"] = http_date(statobj.st_mtime)

    if is_regular_file:
        response["Content-Length"] = size
        response["Accept-Ranges"] = "bytes"
        response["X-Django-Ranges-Supported"] = "1"
        # handle byte-range requests by serving only the requested chunk of the file
        if byte_range is not None:
            start, stop = byte_range
            ranged_file.start = start
            ranged_file.stop = stop
            # Content-Range uses an inclusive last byte, `stop` is exclusive
            response["Content-Range"] = "bytes %d-%d/%d" % (start, stop - 1, size)
            response["Content-Length"] = stop - start
            response.status_code = 206

    if encoding:
        response.headers["Content-Encoding"] = encoding
    return response
def serve_static(request, path, **kwargs):
    """
    Serve a static file located through the staticfiles finders.

    The requested path is normalized and looked up with
    ``django.contrib.staticfiles.finders``. When a file is found, the request
    is handed to ``serve_static_with_byterange_support`` with the file's
    directory as ``document_root``; any extra keyword arguments are passed
    through unchanged.

    Raises Http404 when no finder can locate the path.
    """
    lookup_path = posixpath.normpath(path).lstrip("/")
    found_path = finders.find(lookup_path)

    if found_path:
        # split the located file into its directory and file name so the
        # range-aware view can serve it relative to that directory
        root_dir, file_name = os.path.split(found_path)
        return serve_static_with_byterange_support(request, file_name, document_root=root_dir, **kwargs)

    # nothing found: an empty path or a trailing slash means a directory was requested
    if path == "" or path.endswith("/"):
        raise Http404("Directory indexes are not allowed here.")
    raise Http404("'%s' could not be found" % path)
def parse_range_header(header, resource_size):
    """
    Parses a range header into a list of two-tuples (start, stop) where `start`
    is the starting byte of the range (inclusive) and `stop` is the ending byte
    position of the range (exclusive).

    `resource_size` is the total size of the resource in bytes; it is used to
    resolve open-ended ranges ("500-") and suffix ranges ("-200").

    Returns None if the value of the header is not syntactically valid: missing
    or unknown unit, a range without "-", non-numeric positions, or a range
    whose start is not before its stop.

    Note that `stop` is NOT clamped to `resource_size` for explicit
    "first-last" ranges; callers decide how to treat ranges running past the
    end of the resource.

    https://github.com/satchamo/django/commit/2ce75c5c4bee2a858c0214d136bfcd351fcde11d
    """
    if not header or "=" not in header:
        return None

    units, range_ = header.split("=", 1)
    if units.strip().lower() != "bytes":
        return None

    ranges = []
    for val in range_.split(","):
        val = val.strip()
        if "-" not in val:
            return None
        try:
            if val.startswith("-"):
                # suffix-byte-range-spec: this form specifies the last N bytes of an
                # entity-body (val is e.g. "-200", so adding it subtracts N)
                start = max(resource_size + int(val), 0)
                stop = resource_size
            else:
                # byte-range-spec: first-byte-pos "-" [last-byte-pos]
                first, last = val.split("-", 1)
                start = int(first)
                # the +1 is here since we want the stopping point to be exclusive, whereas in
                # the HTTP spec, the last-byte-pos is inclusive
                stop = int(last) + 1 if last else resource_size
        except ValueError:
            # non-numeric byte positions make the whole header invalid
            return None
        if start >= stop:
            return None
        ranges.append((start, stop))
    return ranges
class RangedFileReader:
    """
    Wraps a file like object with an iterator that runs over part (or all) of
    the file defined by start and stop. Blocks of at most block_size bytes are
    yielded from the starting position, up to, but not including the stop point.

    `start` and `stop` are plain attributes and may be reassigned after
    construction, before iteration begins.

    https://github.com/satchamo/django/commit/2ce75c5c4bee2a858c0214d136bfcd351fcde11d
    """

    # default number of bytes requested per read
    block_size = 8192

    def __init__(self, file_like, start=0, stop=float("inf"), block_size=None):
        """
        file_like: object providing seek() and read(n) (opened in binary mode)
        start: offset of the first byte to yield (inclusive)
        stop: offset to stop at (exclusive); the default reads until EOF
        block_size: maximum bytes per yielded chunk (class default if falsy)
        """
        self.f = file_like
        self.block_size = block_size or RangedFileReader.block_size
        self.start = start
        self.stop = stop

    def __iter__(self):
        """Yield successive chunks of the file between start and stop."""
        self.f.seek(self.start)
        position = self.start
        while position < self.stop:
            data = self.f.read(min(self.block_size, self.stop - position))
            if not data:
                # reached EOF before the stop point
                break
            yield data
            # advance by what was actually read: read() may return fewer bytes
            # than requested, and skipping ahead would silently drop data
            position += len(data)

    def close(self):
        """
        Close the wrapped file, if it can be closed.

        Exposing close() lets whoever consumes this iterator (e.g. a streaming
        HTTP response that closes its content when done) release the file
        handle deterministically instead of waiting for garbage collection.
        """
        closer = getattr(self.f, "close", None)
        if closer is not None:
            closer()

View file

@ -1,14 +1,13 @@
__package__ = 'archivebox.core'
from django.urls import path, include
from django.urls import path, re_path, include
from django.views import static
from django.contrib.staticfiles.urls import staticfiles_urlpatterns
from django.conf import settings
from django.views.generic.base import RedirectView
from .admin import archivebox_admin
from .views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView
from .serve_static import serve_static
# GLOBAL_CONTEXT doesn't work as-is, disabled for now: https://github.com/ArchiveBox/ArchiveBox/discussions/1306
# from config import VERSION, VERSIONS_AVAILABLE, CAN_UPGRADE
@ -18,13 +17,16 @@ from .views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthC
# print('DEBUG', settings.DEBUG)
urlpatterns = [
path('public/', PublicIndexView.as_view(), name='public-index'),
re_path(r"^static/(?P<path>.*)$", serve_static),
# re_path(r"^media/(?P<path>.*)$", static.serve, {"document_root": settings.MEDIA_ROOT}),
path('robots.txt', static.serve, {'document_root': settings.STATICFILES_DIRS[0], 'path': 'robots.txt'}),
path('favicon.ico', static.serve, {'document_root': settings.STATICFILES_DIRS[0], 'path': 'favicon.ico'}),
path('docs/', RedirectView.as_view(url='https://github.com/ArchiveBox/ArchiveBox/wiki'), name='Docs'),
path('public/', PublicIndexView.as_view(), name='public-index'),
path('archive/', RedirectView.as_view(url='/')),
path('archive/<path:path>', SnapshotView.as_view(), name='Snapshot'),
@ -41,7 +43,7 @@ urlpatterns = [
path("api/", include('api.urls'), name='api'),
path('health/', HealthCheckView.as_view(), name='healthcheck'),
path('error/', lambda *_: 1/0),
path('error/', lambda *_: 1/0), # type: ignore
# path('jet_api/', include('jet_django.urls')), Enable to use https://www.jetadmin.io/integrations/django
@ -49,7 +51,6 @@ urlpatterns = [
path('index.json', static.serve, {'document_root': settings.CONFIG.OUTPUT_DIR, 'path': 'index.json'}),
path('', HomepageView.as_view(), name='Home'),
]
urlpatterns += staticfiles_urlpatterns()
if settings.DEBUG_TOOLBAR:
urlpatterns += [path('__debug__/', include("debug_toolbar.urls"))]

View file

@ -46,6 +46,7 @@ from ..main import add
from ..util import base_url, ansi_to_html, htmlencode, urldecode, urlencode, ts_to_date_str
from ..search import query_search_index
from ..extractors.wget import wget_output_path
from .serve_static import serve_static_with_byterange_support
class HomepageView(View):
@ -197,7 +198,9 @@ class SnapshotView(View):
# if they requested snapshot index, serve live rendered template instead of static html
response = self.render_live_index(request, snapshot)
else:
response = static.serve(request, archivefile, document_root=snapshot.link_dir, show_indexes=True)
response = serve_static_with_byterange_support(
request, archivefile, document_root=snapshot.link_dir, show_indexes=True,
)
response["Link"] = f'<{snapshot.url}>; rel="canonical"'
return response
except Snapshot.DoesNotExist: