我有一个函数来将给定的字符串截断为给定的字节长度:
LENGTH_BY_PREFIX = [
(0xC0, 2), # first byte mask, total codepoint length
(0xE0, 3),
(0xF0, 4),
(0xF8, 5),
(0xFC, 6),
]
def codepoint_length(first_byte):
if first_byte < 128:
return 1 # ASCII
for mask, length in LENGTH_BY_PREFIX:
if first_byte & mask == mask:
return length
assert False, 'Invalid byte %r' % first_byte
def cut_string_to_bytes_length(unicode_text, byte_limit):
utf8_bytes = unicode_text.encode('UTF-8')
cut_index = 0
while cut_index < len(utf8_bytes):
step = codepoint_length(ord(utf8_bytes[cut_index]))
if cut_index + step > byte_limit:
# can't go a whole codepoint …Run Code Online (Sandbox Code Playgroud)