This commit is contained in:
ton
2024-10-07 10:13:40 +07:00
parent aa1631742f
commit 3a7d696db6
9729 changed files with 1832837 additions and 161742 deletions

View File

@@ -92,6 +92,8 @@ TOKEN_ENDS = TSPECIALS | WSP
ASPECIALS = TSPECIALS | set("*'%")
ATTRIBUTE_ENDS = ASPECIALS | WSP
EXTENDED_ATTRIBUTE_ENDS = ATTRIBUTE_ENDS - set('%')
NLSET = {'\n', '\r'}
SPECIALSNL = SPECIALS | NLSET
def quote_string(value):
return '"'+str(value).replace('\\', '\\\\').replace('"', r'\"')+'"'
@@ -566,12 +568,14 @@ class DisplayName(Phrase):
if res[0].token_type == 'cfws':
res.pop(0)
else:
if res[0][0].token_type == 'cfws':
if (isinstance(res[0], TokenList) and
res[0][0].token_type == 'cfws'):
res[0] = TokenList(res[0][1:])
if res[-1].token_type == 'cfws':
res.pop()
else:
if res[-1][-1].token_type == 'cfws':
if (isinstance(res[-1], TokenList) and
res[-1][-1].token_type == 'cfws'):
res[-1] = TokenList(res[-1][:-1])
return res.value
@@ -586,9 +590,13 @@ class DisplayName(Phrase):
quote = True
if len(self) != 0 and quote:
pre = post = ''
if self[0].token_type=='cfws' or self[0][0].token_type=='cfws':
if (self[0].token_type == 'cfws' or
isinstance(self[0], TokenList) and
self[0][0].token_type == 'cfws'):
pre = ' '
if self[-1].token_type=='cfws' or self[-1][-1].token_type=='cfws':
if (self[-1].token_type == 'cfws' or
isinstance(self[-1], TokenList) and
self[-1][-1].token_type == 'cfws'):
post = ' '
return pre+quote_string(self.display_name)+post
else:
@@ -949,6 +957,8 @@ class _InvalidEwError(errors.HeaderParseError):
# up other parse trees. Maybe should have tests for that, too.
DOT = ValueTerminal('.', 'dot')
ListSeparator = ValueTerminal(',', 'list-separator')
ListSeparator.as_ew_allowed = False
ListSeparator.syntactic_break = False
RouteComponentMarker = ValueTerminal('@', 'route-component-marker')
#
@@ -1206,7 +1216,7 @@ def get_bare_quoted_string(value):
value is the text between the quote marks, with whitespace
preserved and quoted pairs decoded.
"""
if value[0] != '"':
if not value or value[0] != '"':
raise errors.HeaderParseError(
"expected '\"' but found '{}'".format(value))
bare_quoted_string = BareQuotedString()
@@ -1447,7 +1457,7 @@ def get_local_part(value):
"""
local_part = LocalPart()
leader = None
if value[0] in CFWS_LEADER:
if value and value[0] in CFWS_LEADER:
leader, value = get_cfws(value)
if not value:
raise errors.HeaderParseError(
@@ -1513,13 +1523,18 @@ def get_obs_local_part(value):
raise
token, value = get_cfws(value)
obs_local_part.append(token)
if not obs_local_part:
raise errors.HeaderParseError(
"expected obs-local-part but found '{}'".format(value))
if (obs_local_part[0].token_type == 'dot' or
obs_local_part[0].token_type=='cfws' and
len(obs_local_part) > 1 and
obs_local_part[1].token_type=='dot'):
obs_local_part.defects.append(errors.InvalidHeaderDefect(
"Invalid leading '.' in local part"))
if (obs_local_part[-1].token_type == 'dot' or
obs_local_part[-1].token_type=='cfws' and
len(obs_local_part) > 1 and
obs_local_part[-2].token_type=='dot'):
obs_local_part.defects.append(errors.InvalidHeaderDefect(
"Invalid trailing '.' in local part"))
@@ -1601,7 +1616,7 @@ def get_domain(value):
"""
domain = Domain()
leader = None
if value[0] in CFWS_LEADER:
if value and value[0] in CFWS_LEADER:
leader, value = get_cfws(value)
if not value:
raise errors.HeaderParseError(
@@ -1677,6 +1692,8 @@ def get_obs_route(value):
if value[0] in CFWS_LEADER:
token, value = get_cfws(value)
obs_route.append(token)
if not value:
break
if value[0] == '@':
obs_route.append(RouteComponentMarker)
token, value = get_domain(value[1:])
@@ -1695,7 +1712,7 @@ def get_angle_addr(value):
"""
angle_addr = AngleAddr()
if value[0] in CFWS_LEADER:
if value and value[0] in CFWS_LEADER:
token, value = get_cfws(value)
angle_addr.append(token)
if not value or value[0] != '<':
@@ -1705,7 +1722,7 @@ def get_angle_addr(value):
value = value[1:]
# Although it is not legal per RFC5322, SMTP uses '<>' in certain
# circumstances.
if value[0] == '>':
if value and value[0] == '>':
angle_addr.append(ValueTerminal('>', 'angle-addr-end'))
angle_addr.defects.append(errors.InvalidHeaderDefect(
"null addr-spec in angle-addr"))
@@ -1757,6 +1774,9 @@ def get_name_addr(value):
name_addr = NameAddr()
# Both the optional display name and the angle-addr can start with cfws.
leader = None
if not value:
raise errors.HeaderParseError(
"expected name-addr but found '{}'".format(value))
if value[0] in CFWS_LEADER:
leader, value = get_cfws(value)
if not value:
@@ -1771,7 +1791,10 @@ def get_name_addr(value):
raise errors.HeaderParseError(
"expected name-addr but found '{}'".format(token))
if leader is not None:
token[0][:0] = [leader]
if isinstance(token[0], TokenList):
token[0][:0] = [leader]
else:
token[:0] = [leader]
leader = None
name_addr.append(token)
token, value = get_angle_addr(value)
@@ -2022,7 +2045,7 @@ def get_address_list(value):
address_list.defects.append(errors.InvalidHeaderDefect(
"invalid address in address-list"))
if value: # Must be a , at this point.
address_list.append(ValueTerminal(',', 'list-separator'))
address_list.append(ListSeparator)
value = value[1:]
return address_list, value
@@ -2764,10 +2787,15 @@ def _refold_parse_tree(parse_tree, *, policy):
# max_line_length 0/None means no limit, ie: infinitely long.
maxlen = policy.max_line_length or sys.maxsize
encoding = 'utf-8' if policy.utf8 else 'us-ascii'
lines = ['']
last_ew = None
lines = [''] # Folded lines to be output
leading_whitespace = '' # When we have whitespace between two encoded
# words, we may need to encode the whitespace
# at the beginning of the second word.
last_ew = None # Points to the last encoded character if there's an ew on
# the line
last_charset = None
wrap_as_ew_blocked = 0
want_encoding = False
want_encoding = False # This is set to True if we need to encode this part
end_ew_not_allowed = Terminal('', 'wrap_as_ew_blocked')
parts = list(parse_tree)
while parts:
@@ -2776,9 +2804,13 @@ def _refold_parse_tree(parse_tree, *, policy):
wrap_as_ew_blocked -= 1
continue
tstr = str(part)
if part.token_type == 'ptext' and set(tstr) & SPECIALS:
# Encode if tstr contains special characters.
want_encoding = True
if not want_encoding:
if part.token_type == 'ptext':
# Encode if tstr contains special characters.
want_encoding = not SPECIALSNL.isdisjoint(tstr)
else:
# Encode if tstr contains newlines.
want_encoding = not NLSET.isdisjoint(tstr)
try:
tstr.encode(encoding)
charset = encoding
@@ -2791,10 +2823,12 @@ def _refold_parse_tree(parse_tree, *, policy):
# 'charset' property on the policy.
charset = 'utf-8'
want_encoding = True
if part.token_type == 'mime-parameters':
# Mime parameter folding (using RFC2231) is extra special.
_fold_mime_parameters(part, lines, maxlen, encoding)
continue
if want_encoding and not wrap_as_ew_blocked:
if not part.as_ew_allowed:
want_encoding = False
@@ -2817,24 +2851,55 @@ def _refold_parse_tree(parse_tree, *, policy):
if not hasattr(part, 'encode'):
# It's not a Terminal, do each piece individually.
parts = list(part) + parts
else:
want_encoding = False
continue
elif part.as_ew_allowed:
# It's a terminal, wrap it as an encoded word, possibly
# combining it with previously encoded words if allowed.
if (last_ew is not None and
charset != last_charset and
(last_charset == 'unknown-8bit' or
last_charset == 'utf-8' and charset != 'us-ascii')):
last_ew = None
last_ew = _fold_as_ew(tstr, lines, maxlen, last_ew,
part.ew_combine_allowed, charset)
want_encoding = False
continue
part.ew_combine_allowed, charset, leading_whitespace)
# This whitespace has been added to the lines in _fold_as_ew()
# so clear it now.
leading_whitespace = ''
last_charset = charset
want_encoding = False
continue
else:
# It's a terminal which should be kept non-encoded
# (e.g. a ListSeparator).
last_ew = None
want_encoding = False
# fall through
if len(tstr) <= maxlen - len(lines[-1]):
lines[-1] += tstr
continue
# This part is too long to fit. The RFC wants us to break at
# "major syntactic breaks", so unless we don't consider this
# to be one, check if it will fit on the next line by itself.
leading_whitespace = ''
if (part.syntactic_break and
len(tstr) + 1 <= maxlen):
newline = _steal_trailing_WSP_if_exists(lines)
if newline or part.startswith_fws():
# We're going to fold the data onto a new line here. Due to
# the way encoded strings handle continuation lines, we need to
# be prepared to encode any whitespace if the next line turns
# out to start with an encoded word.
lines.append(newline + tstr)
whitespace_accumulator = []
for char in lines[-1]:
if char not in WSP:
break
whitespace_accumulator.append(char)
leading_whitespace = ''.join(whitespace_accumulator)
last_ew = None
continue
if not hasattr(part, 'encode'):
@@ -2858,9 +2923,10 @@ def _refold_parse_tree(parse_tree, *, policy):
else:
# We can't fold it onto the next line either...
lines[-1] += tstr
return policy.linesep.join(lines) + policy.linesep
def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset):
def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset, leading_whitespace):
"""Fold string to_encode into lines as encoded word, combining if allowed.
Return the new value for last_ew, or None if ew_combine_allowed is False.
@@ -2875,7 +2941,7 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset):
to_encode = str(
get_unstructured(lines[-1][last_ew:] + to_encode))
lines[-1] = lines[-1][:last_ew]
if to_encode[0] in WSP:
elif to_encode[0] in WSP:
# We're joining this to non-encoded text, so don't encode
# the leading blank.
leading_wsp = to_encode[0]
@@ -2883,6 +2949,7 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset):
if (len(lines[-1]) == maxlen):
lines.append(_steal_trailing_WSP_if_exists(lines))
lines[-1] += leading_wsp
trailing_wsp = ''
if to_encode[-1] in WSP:
# Likewise for the trailing space.
@@ -2902,11 +2969,20 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset):
while to_encode:
remaining_space = maxlen - len(lines[-1])
text_space = remaining_space - chrome_len
text_space = remaining_space - chrome_len - len(leading_whitespace)
if text_space <= 0:
lines.append(' ')
continue
# If we are at the start of a continuation line, prepend whitespace
# (we only want to do this when the line starts with an encoded word
# but if we're folding in this helper function, then we know that we
# are going to be writing out an encoded word.)
if len(lines) > 1 and len(lines[-1]) == 1 and leading_whitespace:
encoded_word = _ew.encode(leading_whitespace, charset=encode_as)
lines[-1] += encoded_word
leading_whitespace = ''
to_encode_word = to_encode[:text_space]
encoded_word = _ew.encode(to_encode_word, charset=encode_as)
excess = len(encoded_word) - remaining_space
@@ -2918,6 +2994,7 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset):
excess = len(encoded_word) - remaining_space
lines[-1] += encoded_word
to_encode = to_encode[len(to_encode_word):]
leading_whitespace = ''
if to_encode:
lines.append(' ')

View File

@@ -152,11 +152,18 @@ class Policy(_PolicyBase, metaclass=abc.ABCMeta):
mangle_from_ -- a flag that, when True escapes From_ lines in the
body of the message by putting a `>' in front of
them. This is used when the message is being
serialized by a generator. Default: True.
serialized by a generator. Default: False.
message_factory -- the class to use to create new message objects.
If the value is None, the default is Message.
verify_generated_headers
-- if true, the generator verifies that each header
they are properly folded, so that a parser won't
treat it as multiple headers, start-of-body, or
part of another header.
This is a check against custom Header & fold()
implementations.
"""
raise_on_defect = False
@@ -165,6 +172,7 @@ class Policy(_PolicyBase, metaclass=abc.ABCMeta):
max_line_length = 78
mangle_from_ = False
message_factory = None
verify_generated_headers = True
def handle_defect(self, obj, defect):
"""Based on policy, either raise defect or call register_defect.

View File

@@ -29,6 +29,10 @@ class CharsetError(MessageError):
"""An illegal charset was given."""
class HeaderWriteError(MessageError):
"""Error while writing headers."""
# These are parsing defects which the parser was able to work around.
class MessageDefect(ValueError):
"""Base class for a message defect."""

View File

@@ -14,12 +14,14 @@ import random
from copy import deepcopy
from io import StringIO, BytesIO
from email.utils import _has_surrogates
from email.errors import HeaderWriteError
UNDERSCORE = '_'
NL = '\n' # XXX: no longer used by the code below.
NLCRE = re.compile(r'\r\n|\r|\n')
fcre = re.compile(r'^From ', re.MULTILINE)
NEWLINE_WITHOUT_FWSP = re.compile(r'\r\n[^ \t]|\r[^ \n\t]|\n[^ \t]')
class Generator:
@@ -222,7 +224,16 @@ class Generator:
def _write_headers(self, msg):
for h, v in msg.raw_items():
self.write(self.policy.fold(h, v))
folded = self.policy.fold(h, v)
if self.policy.verify_generated_headers:
linesep = self.policy.linesep
if not folded.endswith(self.policy.linesep):
raise HeaderWriteError(
f'folded header does not end with {linesep!r}: {folded!r}')
if NEWLINE_WITHOUT_FWSP.search(folded.removesuffix(linesep)):
raise HeaderWriteError(
f'folded header contains newline: {folded!r}')
self.write(folded)
# A blank line always separates headers from body
self.write(self._NL)
@@ -243,7 +254,7 @@ class Generator:
# existing message.
msg = deepcopy(msg)
del msg['content-transfer-encoding']
msg.set_payload(payload, charset)
msg.set_payload(msg._payload, charset)
payload = msg.get_payload()
self._munge_cte = (msg['content-transfer-encoding'],
msg['content-type'])

View File

@@ -289,25 +289,26 @@ class Message:
# cte might be a Header, so for now stringify it.
cte = str(self.get('content-transfer-encoding', '')).lower()
# payload may be bytes here.
if isinstance(payload, str):
if utils._has_surrogates(payload):
bpayload = payload.encode('ascii', 'surrogateescape')
if not decode:
if not decode:
if isinstance(payload, str) and utils._has_surrogates(payload):
try:
bpayload = payload.encode('ascii', 'surrogateescape')
try:
payload = bpayload.decode(self.get_param('charset', 'ascii'), 'replace')
payload = bpayload.decode(self.get_content_charset('ascii'), 'replace')
except LookupError:
payload = bpayload.decode('ascii', 'replace')
elif decode:
try:
bpayload = payload.encode('ascii')
except UnicodeError:
# This won't happen for RFC compliant messages (messages
# containing only ASCII code points in the unicode input).
# If it does happen, turn the string into bytes in a way
# guaranteed not to fail.
bpayload = payload.encode('raw-unicode-escape')
if not decode:
except UnicodeEncodeError:
pass
return payload
if isinstance(payload, str):
try:
bpayload = payload.encode('ascii', 'surrogateescape')
except UnicodeEncodeError:
# This won't happen for RFC compliant messages (messages
# containing only ASCII code points in the unicode input).
# If it does happen, turn the string into bytes in a way
# guaranteed not to fail.
bpayload = payload.encode('raw-unicode-escape')
if cte == 'quoted-printable':
return quopri.decodestring(bpayload)
elif cte == 'base64':
@@ -339,7 +340,7 @@ class Message:
return
if not isinstance(charset, Charset):
charset = Charset(charset)
payload = payload.encode(charset.output_charset)
payload = payload.encode(charset.output_charset, 'surrogateescape')
if hasattr(payload, 'decode'):
self._payload = payload.decode('ascii', 'surrogateescape')
else:

View File

@@ -21,7 +21,7 @@ __all__ = [
'HTTP',
]
linesep_splitter = re.compile(r'\n|\r')
linesep_splitter = re.compile(r'\n|\r\n?')
@_extend_docstrings
class EmailPolicy(Policy):
@@ -205,13 +205,21 @@ class EmailPolicy(Policy):
if hasattr(value, 'name'):
return value.fold(policy=self)
maxlen = self.max_line_length if self.max_line_length else sys.maxsize
lines = value.splitlines()
# We can't use splitlines here because it splits on more than \r and \n.
lines = linesep_splitter.split(value)
refold = (self.refold_source == 'all' or
self.refold_source == 'long' and
(lines and len(lines[0])+len(name)+2 > maxlen or
any(len(x) > maxlen for x in lines[1:])))
if refold or refold_binary and _has_surrogates(value):
if not refold:
if not self.utf8:
refold = not value.isascii()
elif refold_binary:
refold = _has_surrogates(value)
if refold:
return self.header_factory(name, ''.join(lines)).fold(policy=self)
return name + ': ' + self.linesep.join(lines) + self.linesep

View File

@@ -48,11 +48,12 @@ TICK = "'"
specialsre = re.compile(r'[][\\()<>@,:;".]')
escapesre = re.compile(r'[\\"]')
def _has_surrogates(s):
"""Return True if s contains surrogate-escaped binary data."""
"""Return True if s may contain surrogate-escaped binary data."""
# This check is based on the fact that unless there are surrogates, utf8
# (Python's default encoding) can encode any string. This is the fastest
# way to check for surrogates, see issue 11454 for timings.
# way to check for surrogates, see bpo-11454 (moved to gh-55663) for timings.
try:
s.encode()
return False
@@ -106,12 +107,127 @@ def formataddr(pair, charset='utf-8'):
return address
def _iter_escaped_chars(addr):
pos = 0
escape = False
for pos, ch in enumerate(addr):
if escape:
yield (pos, '\\' + ch)
escape = False
elif ch == '\\':
escape = True
else:
yield (pos, ch)
if escape:
yield (pos, '\\')
def getaddresses(fieldvalues):
"""Return a list of (REALNAME, EMAIL) for each fieldvalue."""
all = COMMASPACE.join(str(v) for v in fieldvalues)
a = _AddressList(all)
return a.addresslist
def _strip_quoted_realnames(addr):
"""Strip real names between quotes."""
if '"' not in addr:
# Fast path
return addr
start = 0
open_pos = None
result = []
for pos, ch in _iter_escaped_chars(addr):
if ch == '"':
if open_pos is None:
open_pos = pos
else:
if start != open_pos:
result.append(addr[start:open_pos])
start = pos + 1
open_pos = None
if start < len(addr):
result.append(addr[start:])
return ''.join(result)
supports_strict_parsing = True
def getaddresses(fieldvalues, *, strict=True):
"""Return a list of (REALNAME, EMAIL) or ('','') for each fieldvalue.
When parsing fails for a fieldvalue, a 2-tuple of ('', '') is returned in
its place.
If strict is true, use a strict parser which rejects malformed inputs.
"""
# If strict is true, if the resulting list of parsed addresses is greater
# than the number of fieldvalues in the input list, a parsing error has
# occurred and consequently a list containing a single empty 2-tuple [('',
# '')] is returned in its place. This is done to avoid invalid output.
#
# Malformed input: getaddresses(['alice@example.com <bob@example.com>'])
# Invalid output: [('', 'alice@example.com'), ('', 'bob@example.com')]
# Safe output: [('', '')]
if not strict:
all = COMMASPACE.join(str(v) for v in fieldvalues)
a = _AddressList(all)
return a.addresslist
fieldvalues = [str(v) for v in fieldvalues]
fieldvalues = _pre_parse_validation(fieldvalues)
addr = COMMASPACE.join(fieldvalues)
a = _AddressList(addr)
result = _post_parse_validation(a.addresslist)
# Treat output as invalid if the number of addresses is not equal to the
# expected number of addresses.
n = 0
for v in fieldvalues:
# When a comma is used in the Real Name part it is not a deliminator.
# So strip those out before counting the commas.
v = _strip_quoted_realnames(v)
# Expected number of addresses: 1 + number of commas
n += 1 + v.count(',')
if len(result) != n:
return [('', '')]
return result
def _check_parenthesis(addr):
# Ignore parenthesis in quoted real names.
addr = _strip_quoted_realnames(addr)
opens = 0
for pos, ch in _iter_escaped_chars(addr):
if ch == '(':
opens += 1
elif ch == ')':
opens -= 1
if opens < 0:
return False
return (opens == 0)
def _pre_parse_validation(email_header_fields):
accepted_values = []
for v in email_header_fields:
if not _check_parenthesis(v):
v = "('', '')"
accepted_values.append(v)
return accepted_values
def _post_parse_validation(parsed_email_header_tuples):
accepted_values = []
# The parser would have parsed a correctly formatted domain-literal
# The existence of an [ after parsing indicates a parsing failure
for v in parsed_email_header_tuples:
if '[' in v[1]:
v = ('', '')
accepted_values.append(v)
return accepted_values
def _format_timetuple_and_zone(timetuple, zone):
@@ -128,7 +244,7 @@ def formatdate(timeval=None, localtime=False, usegmt=False):
Fri, 09 Nov 2001 01:08:47 -0000
Optional timeval if given is a floating point time value as accepted by
Optional timeval if given is a floating-point time value as accepted by
gmtime() and localtime(), otherwise the current time is used.
Optional localtime is a flag that when True, interprets timeval, and
@@ -205,16 +321,33 @@ def parsedate_to_datetime(data):
tzinfo=datetime.timezone(datetime.timedelta(seconds=tz)))
def parseaddr(addr):
def parseaddr(addr, *, strict=True):
"""
Parse addr into its constituent realname and email address parts.
Return a tuple of realname and email address, unless the parse fails, in
which case return a 2-tuple of ('', '').
If strict is True, use a strict parser which rejects malformed inputs.
"""
addrs = _AddressList(addr).addresslist
if not addrs:
return '', ''
if not strict:
addrs = _AddressList(addr).addresslist
if not addrs:
return ('', '')
return addrs[0]
if isinstance(addr, list):
addr = addr[0]
if not isinstance(addr, str):
return ('', '')
addr = _pre_parse_validation([addr])[0]
addrs = _post_parse_validation(_AddressList(addr).addresslist)
if not addrs or len(addrs) > 1:
return ('', '')
return addrs[0]