"""
RefererMiddleware: populates Request referer field, based on the Response which
originated it.
"""
from __future__ import annotations
import warnings
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, cast
from urllib.parse import urlparse
from warnings import warn
from scrapy.exceptions import NotConfigured
from scrapy.http import Request, Response
from scrapy.spidermiddlewares.base import BaseSpiderMiddleware
from scrapy.utils.misc import load_object
from scrapy.utils.python import _looks_like_import_path, to_unicode
from scrapy.utils.url import strip_url
if TYPE_CHECKING:
# typing.Self requires Python 3.11
from typing_extensions import Self
from scrapy.crawler import Crawler
from scrapy.settings import BaseSettings
# URL schemes treated as "local"; ReferrerPolicy uses this tuple as its
# default NOREFERRER_SCHEMES.
LOCAL_SCHEMES: tuple[str, ...] = ("about", "blob", "data", "filesystem")

# Policy tokens defined by the W3C Referrer Policy specification
# (https://www.w3.org/TR/referrer-policy/#referrer-policies).
POLICY_NO_REFERRER = "no-referrer"
POLICY_NO_REFERRER_WHEN_DOWNGRADE = "no-referrer-when-downgrade"
POLICY_SAME_ORIGIN = "same-origin"
POLICY_ORIGIN = "origin"
POLICY_STRICT_ORIGIN = "strict-origin"
POLICY_ORIGIN_WHEN_CROSS_ORIGIN = "origin-when-cross-origin"
POLICY_STRICT_ORIGIN_WHEN_CROSS_ORIGIN = "strict-origin-when-cross-origin"
POLICY_UNSAFE_URL = "unsafe-url"

# Scrapy-specific token, used as the name of DefaultReferrerPolicy.
POLICY_SCRAPY_DEFAULT = "scrapy-default"
[docs]
class ReferrerPolicy(ABC):
    """Base class that every referrer policy derives from.

    Subclasses implement :meth:`referrer`; the other methods are shared
    helpers for stripping URLs and classifying them by scheme.
    """

    # Schemes of the referring URL for which stripped_referrer() and
    # origin_referrer() return None.
    NOREFERRER_SCHEMES: tuple[str, ...] = LOCAL_SCHEMES
    # Policy token identifying the concrete policy (set by subclasses).
    name: str

    @abstractmethod
    def referrer(self, response_url: str, request_url: str) -> str | None:
        """Return the referrer for a request to *request_url* that originates
        from *response_url*, or ``None`` when no referrer must be sent."""
        raise NotImplementedError

    def stripped_referrer(self, url: str) -> str | None:
        """Return *url* stripped for use as a referrer, or ``None`` if its
        scheme is listed in :attr:`NOREFERRER_SCHEMES`."""
        if urlparse(url).scheme in self.NOREFERRER_SCHEMES:
            return None
        return self.strip_url(url)

    def origin_referrer(self, url: str) -> str | None:
        """Return the origin of *url*, or ``None`` if its scheme is listed in
        :attr:`NOREFERRER_SCHEMES`."""
        if urlparse(url).scheme in self.NOREFERRER_SCHEMES:
            return None
        return self.origin(url)

    def strip_url(self, url: str, origin_only: bool = False) -> str | None:
        """Strip *url* for use as a referrer.

        Modelled on https://www.w3.org/TR/referrer-policy/#strip-url

        A falsy *url* yields ``None``. Otherwise the work is delegated to
        :func:`scrapy.utils.url.strip_url`, asking it to strip credentials,
        the fragment and the default port; *origin_only* is forwarded
        unchanged.

        Note that this method does not look at the URL scheme: the
        no-referrer scheme check lives in :meth:`stripped_referrer` and
        :meth:`origin_referrer`.
        """
        if url:
            # Inside the method body the bare name resolves to the
            # module-level helper, not to this method.
            return strip_url(
                url,
                strip_credentials=True,
                strip_fragment=True,
                strip_default_port=True,
                origin_only=origin_only,
            )
        return None

    def origin(self, url: str) -> str | None:
        """Return the serialized origin of *url*, i.e. the result of
        :meth:`strip_url` with ``origin_only=True``."""
        return self.strip_url(url, origin_only=True)

    def potentially_trustworthy(self, url: str) -> bool:
        """Return whether *url* counts as potentially trustworthy.

        ``data:`` URLs never do; any other URL does exactly when it is
        TLS-protected.
        """
        # Note: this does not follow https://w3c.github.io/webappsec-secure-contexts/#is-url-trustworthy
        if urlparse(url).scheme == "data":
            return False
        return self.tls_protected(url)

    def tls_protected(self, url: str) -> bool:
        """Return whether the scheme of *url* is ``https`` or ``ftps``."""
        scheme = urlparse(url).scheme
        return scheme in ("https", "ftps")
[docs]
class NoReferrerPolicy(ReferrerPolicy):
    """
    https://www.w3.org/TR/referrer-policy/#referrer-policy-no-referrer

    "no-referrer" is the simplest policy: no referrer information is sent
    with any request, whatever its origin, so the header is omitted
    entirely.
    """

    name: str = POLICY_NO_REFERRER

    def referrer(self, response_url: str, request_url: str) -> str | None:
        """Always return ``None``; both arguments are ignored."""
        return None
[docs]
class NoReferrerWhenDowngradePolicy(ReferrerPolicy):
    """
    https://www.w3.org/TR/referrer-policy/#referrer-policy-no-referrer-when-downgrade

    With "no-referrer-when-downgrade" the full (stripped) URL is sent unless
    the request is a downgrade, i.e. it goes from a TLS-protected response
    URL to a request URL that is not TLS-protected. Downgrading requests
    carry no referrer information, so no Referer HTTP header is sent.

    This is a user agent's default behavior, if no policy is otherwise
    specified.
    """

    name: str = POLICY_NO_REFERRER_WHEN_DOWNGRADE

    def referrer(self, response_url: str, request_url: str) -> str | None:
        """Return the stripped *response_url*, or ``None`` on a downgrade."""
        is_downgrade = self.tls_protected(response_url) and not self.tls_protected(
            request_url
        )
        if is_downgrade:
            return None
        return self.stripped_referrer(response_url)
[docs]
class SameOriginPolicy(ReferrerPolicy):
    """
    https://www.w3.org/TR/referrer-policy/#referrer-policy-same-origin

    With "same-origin" the full URL, stripped for use as a referrer, is sent
    only for same-origin requests. Cross-origin requests carry no referrer
    information, so no Referer HTTP header is sent for them.
    """

    name: str = POLICY_SAME_ORIGIN

    def referrer(self, response_url: str, request_url: str) -> str | None:
        """Return the stripped *response_url* when both URLs share an origin,
        ``None`` otherwise."""
        if self.origin(response_url) != self.origin(request_url):
            return None
        return self.stripped_referrer(response_url)
[docs]
class OriginPolicy(ReferrerPolicy):
    """
    https://www.w3.org/TR/referrer-policy/#referrer-policy-origin

    With "origin" only the ASCII serialization of the origin of the request
    client is sent as referrer information, for same-origin and cross-origin
    requests alike.
    """

    name: str = POLICY_ORIGIN

    def referrer(self, response_url: str, request_url: str) -> str | None:
        """Return the origin of *response_url*; *request_url* is not used."""
        return self.origin_referrer(response_url)
[docs]
class StrictOriginPolicy(ReferrerPolicy):
    """
    https://www.w3.org/TR/referrer-policy/#referrer-policy-strict-origin

    With "strict-origin" the ASCII serialization of the origin of the
    request client is sent when making requests:

    - from a TLS-protected environment settings object to a potentially
      trustworthy URL, and
    - from non-TLS-protected environment settings objects to any origin.

    Requests from TLS-protected request clients to non-potentially
    trustworthy URLs carry no referrer information, so no Referer HTTP
    header is sent for them.
    """

    name: str = POLICY_STRICT_ORIGIN

    def referrer(self, response_url: str, request_url: str) -> str | None:
        """Return the origin of *response_url*, or ``None`` when a
        TLS-protected response leads to a non-trustworthy request URL."""
        # Equivalent to the negation of
        # "(tls and trustworthy) or not tls", spelled as a guard clause.
        is_downgrade = self.tls_protected(
            response_url
        ) and not self.potentially_trustworthy(request_url)
        if is_downgrade:
            return None
        return self.origin_referrer(response_url)
[docs]
class OriginWhenCrossOriginPolicy(ReferrerPolicy):
    """
    https://www.w3.org/TR/referrer-policy/#referrer-policy-origin-when-cross-origin

    The "origin-when-cross-origin" policy specifies that a full URL,
    stripped for use as a referrer, is sent as referrer information
    when making same-origin requests from a particular request client,
    and only the ASCII serialization of the origin of the request client
    is sent as referrer information when making cross-origin requests
    from a particular request client.
    """

    name: str = POLICY_ORIGIN_WHEN_CROSS_ORIGIN

    def referrer(self, response_url: str, request_url: str) -> str | None:
        """Return the referrer for a request to *request_url*.

        Same-origin requests get the stripped *response_url*; cross-origin
        requests get only its origin. In both cases ``None`` is returned when
        the scheme of *response_url* is listed in ``NOREFERRER_SCHEMES``.
        """
        if self.origin(response_url) == self.origin(request_url):
            return self.stripped_referrer(response_url)
        # Use origin_referrer() instead of the raw origin so that the
        # NOREFERRER_SCHEMES check also applies to the cross-origin branch,
        # as it does in OriginPolicy and StrictOriginWhenCrossOriginPolicy.
        return self.origin_referrer(response_url)
[docs]
class StrictOriginWhenCrossOriginPolicy(ReferrerPolicy):
    """
    https://www.w3.org/TR/referrer-policy/#referrer-policy-strict-origin-when-cross-origin

    With "strict-origin-when-cross-origin" the full URL, stripped for use as
    a referrer, is sent for same-origin requests, and only the ASCII
    serialization of the origin of the request client is sent for
    cross-origin requests:

    - from a TLS-protected environment settings object to a potentially
      trustworthy URL, and
    - from non-TLS-protected environment settings objects to any origin.

    Requests from TLS-protected clients to non-potentially trustworthy URLs
    carry no referrer information, so no Referer HTTP header is sent for
    them.
    """

    name: str = POLICY_STRICT_ORIGIN_WHEN_CROSS_ORIGIN

    def referrer(self, response_url: str, request_url: str) -> str | None:
        """Return the stripped *response_url* for same-origin requests, its
        origin for acceptable cross-origin requests, ``None`` otherwise."""
        if self.origin(response_url) == self.origin(request_url):
            return self.stripped_referrer(response_url)
        # Cross-origin: refuse only when a TLS-protected response leads to a
        # request URL that is not potentially trustworthy.
        is_downgrade = self.tls_protected(
            response_url
        ) and not self.potentially_trustworthy(request_url)
        if is_downgrade:
            return None
        return self.origin_referrer(response_url)
[docs]
class UnsafeUrlPolicy(ReferrerPolicy):
    """
    https://www.w3.org/TR/referrer-policy/#referrer-policy-unsafe-url

    With "unsafe-url" the full URL, stripped for use as a referrer, is sent
    along with both cross-origin and same-origin requests.

    Note: The policy's name doesn't lie; it is unsafe. It leaks origins and
    paths from TLS-protected resources to insecure origins. Carefully
    consider the impact of setting such a policy for potentially sensitive
    documents.
    """

    name: str = POLICY_UNSAFE_URL

    def referrer(self, response_url: str, request_url: str) -> str | None:
        """Return the stripped *response_url*; *request_url* is not used."""
        return self.stripped_referrer(response_url)
[docs]
class DefaultReferrerPolicy(NoReferrerWhenDowngradePolicy):
    """
    Scrapy's flavour of "no-referrer-when-downgrade": in addition to the
    local schemes, no "Referer" is produced when the referring URL uses the
    ``file://`` or ``s3://`` scheme.
    """

    # Local schemes plus the two extra schemes excluded by this policy.
    NOREFERRER_SCHEMES: tuple[str, ...] = LOCAL_SCHEMES + ("file", "s3")
    name: str = POLICY_SCRAPY_DEFAULT
[docs]
class RefererMiddleware(BaseSpiderMiddleware):
    """Spider middleware that sets the ``Referer`` header of requests.

    For every request that originates from a response, a referrer policy is
    selected (request meta, then the response ``Referrer-Policy`` header,
    then the configured default) and the referrer it computes, if any, is
    stored as the request's ``Referer`` header unless one is already set.
    """

    def __init__(self, settings: BaseSettings | None = None):  # pylint: disable=super-init-not-called
        """Build the policy registry and pick the default policy.

        Without *settings*, the built-in policies are registered and
        :class:`DefaultReferrerPolicy` is the default. With *settings*, the
        ``REFERRER_POLICIES`` mapping adds, replaces or (when the value is
        ``None``) removes registry entries, and ``REFERRER_POLICY`` selects
        the default policy.

        :raises KeyError: if ``REFERRER_POLICIES`` maps a name that is not
            registered to ``None``.
        :raises RuntimeError: if ``REFERRER_POLICY`` cannot be resolved.
        """
        self.default_policy: type[ReferrerPolicy] = DefaultReferrerPolicy
        self.policies: dict[str, type[ReferrerPolicy]] = {
            p.name: p
            for p in (
                NoReferrerPolicy,
                NoReferrerWhenDowngradePolicy,
                SameOriginPolicy,
                OriginPolicy,
                StrictOriginPolicy,
                OriginWhenCrossOriginPolicy,
                StrictOriginWhenCrossOriginPolicy,
                UnsafeUrlPolicy,
                DefaultReferrerPolicy,
            )
        }
        # Reference: https://www.w3.org/TR/referrer-policy/#referrer-policy-empty-string
        self.policies[""] = NoReferrerWhenDowngradePolicy
        if settings is None:
            return
        setting_policies = settings.getdict("REFERRER_POLICIES")
        for policy_name, policy_class_import_path in setting_policies.items():
            if policy_class_import_path is None:
                # A None value disables the policy registered under that name.
                del self.policies[policy_name]
            else:
                self.policies[policy_name] = load_object(policy_class_import_path)
        # The setting is trusted, so import paths are allowed; with the
        # default warning_only=False a failure raises instead of returning
        # None, hence the assert only narrows the type.
        settings_policy = self._load_policy_class(
            settings.get("REFERRER_POLICY"), allow_import_path=True
        )
        assert settings_policy
        self.default_policy = settings_policy

    @classmethod
    def from_crawler(cls, crawler: Crawler) -> Self:
        """Create the middleware from *crawler* settings.

        :raises NotConfigured: if the ``REFERER_ENABLED`` setting is false.
        """
        if not crawler.settings.getbool("REFERER_ENABLED"):
            raise NotConfigured
        return cls(crawler.settings)

    def policy(
        self,
        response: Response | str | None = None,
        request: Request | None = None,
        **kwargs,
    ) -> ReferrerPolicy:
        """Return the referrer policy to use for *request* based on *request*
        meta, *response* and settings.

        - if a valid policy is set in Request meta, it is used.
        - if the policy is set in meta but is wrong (e.g. a typo error), the
          policy from settings is used
        - if the policy is not set in Request meta, but there is a
          Referrer-Policy header in the parent response, it is used if valid
        - otherwise, the policy from settings is used.

        Passing the response as the ``resp_or_url`` keyword, or passing a URL
        string instead of a :class:`Response`, is deprecated and emits a
        :exc:`DeprecationWarning`.

        :raises TypeError: if *response* or *request* is missing, or if both
            ``response`` and ``resp_or_url`` are given.
        """
        if "resp_or_url" in kwargs:
            if response is not None:
                raise TypeError("Cannot pass both 'response' and 'resp_or_url'")
            response = kwargs.pop("resp_or_url")
            warn(
                "Passing 'resp_or_url' is deprecated, use 'response' instead.",
                DeprecationWarning,
                stacklevel=2,
            )
        if response is None:
            raise TypeError("Missing required argument: 'response'")
        if request is None:
            raise TypeError("Missing required argument: 'request'")
        if isinstance(response, str):
            warn(
                "Passing a response URL to RefererMiddleware.policy() instead "
                "of a Response object is deprecated.",
                DeprecationWarning,
                stacklevel=2,
            )
        # Request meta is set by the spider author and therefore trusted.
        allow_import_path = True
        policy_name = request.meta.get("referrer_policy")
        if policy_name is None and isinstance(response, Response):
            policy_header = response.headers.get("Referrer-Policy")
            if policy_header is not None:
                # decode() already yields a str, no further conversion needed.
                policy_name = policy_header.decode("latin1")
                # The header comes from the remote server: never import
                # objects based on it.
                allow_import_path = False
        if policy_name is None:
            return self.default_policy()
        cls = self._load_policy_class(
            policy_name, warning_only=True, allow_import_path=allow_import_path
        )
        return cls() if cls else self.default_policy()

    def _load_policy_class(
        self,
        policy: str,
        warning_only: bool = False,
        *,
        allow_import_path: bool = False,
    ) -> type[ReferrerPolicy] | None:
        """Load the :class:`ReferrerPolicy` class to use for *policy*.

        *policy* may be any of the following:

        - A standard policy name, e.g. ``"no-referrer"``,
          ``"origin-when-cross-origin"``, etc.
        - The special ``"scrapy-default"`` policy.
        - The import path of a :class:`ReferrerPolicy` subclass, e.g.
          ``"scrapy.spidermiddlewares.referer.NoReferrerPolicy"`` or
          ``"myproject.policies.CustomReferrerPolicy"``.

        A comma-separated list of names is also accepted; the last name that
        is registered wins.

        If *warning_only* is ``False`` (default) and *policy* cannot be turned
        into a :class:`ReferrerPolicy` subclass, a :exc:`RuntimeError` is
        raised. If *warning_only* is ``True``, a warning is logged and ``None``
        is returned instead. This also covers import paths that cannot be
        imported; the import failure is chained as the cause of the
        :exc:`RuntimeError`.

        If *allow_import_path* is ``False`` (default), import paths are not
        allowed, resulting in :exc:`RuntimeError` or ``None``. If ``True``,
        they are allowed. Use ``True`` only if you trust the source of the
        *policy* value.
        """
        load_error: Exception | None = None
        if allow_import_path:
            try:
                return cast("type[ReferrerPolicy]", load_object(policy))
            except ValueError:
                # Not an import path: fall through to the name lookup.
                pass
            except (ImportError, NameError) as exc:
                # Presumably what load_object raises for a dotted value whose
                # module or attribute does not exist. Treat it like any other
                # unloadable policy so the warning_only contract holds
                # instead of the error escaping to the caller.
                load_error = exc
        policy_names = [
            policy_name.strip() for policy_name in policy.lower().split(",")
        ]
        # https://www.w3.org/TR/referrer-policy/#parse-referrer-policy-from-header
        for policy_name in policy_names[::-1]:
            if policy_name in self.policies:
                return self.policies[policy_name]
        msg = f"Could not load referrer policy {policy!r}"
        if not allow_import_path and _looks_like_import_path(policy):
            msg += " (import paths from the response Referrer-Policy header are not allowed)"
        if not warning_only:
            raise RuntimeError(msg) from load_error
        warnings.warn(msg, RuntimeWarning)
        return None

    def get_processed_request(
        self, request: Request, response: Response | None
    ) -> Request | None:
        """Set the ``Referer`` header of *request* and return the request.

        Requests without a parent *response* are returned untouched. An
        existing ``Referer`` header is kept, because the value is applied
        with ``setdefault``.
        """
        if response is None:
            # start requests
            return request
        referrer = self.policy(response, request).referrer(response.url, request.url)
        if referrer is not None:
            request.headers.setdefault("Referer", referrer)
        return request