# File: LP6E/Chapter37/source-encoding-utf8.py
# -*- coding: UTF-8 -*-
#----------------------------------------------------------------------------
# Demo all the ways to code non-ASCII text in Python, plus source encodings.
#
# If this file is saved as Latin-1 text, it works as is. But changing the
# coding line above to either ASCII or UTF-8 will then fail because the
# Latin-1 0xc4 and 0xe8 saved in myStr1's value are not valid in either.
#
# A UTF-8 line works if this file is also saved as UTF-8 to make its myStr1
# text match. Because UTF-8 is the default for source, the line above is
# optional if the file is saved as UTF-8 or its text is all UTF-8 compatible
# (e.g., ASCII, which is a subset of both the Latin-1 and UTF-8 encodings).
#----------------------------------------------------------------------------
import locale
import sys

# Four spellings of the same five-character text: A, Ä (U+00C4), B, è (U+00E8), C.
myStr1 = 'AÄBèC'                                    # Raw, per source encoding
myStr2 = 'A\xc4B\xe8C'                              # Hex code-point escapes
myStr3 = 'A\u00c4B\U000000e8C'                      # Unicode short/long escapes
myStr4 = 'A' + chr(0xC4) + 'B' + chr(0xE8) + 'C'    # Concatenated code points

# Report the host platform and the encodings Python reports as defaults here.
print('Sys hosting platform: ', sys.platform)
print('Sys default encoding: ', sys.getdefaultencoding())
print('Open default encoding:', locale.getpreferredencoding(False))

# Each spelling yields the same decoded string; only the encoded byte
# counts differ, depending on the encoding used.
for aStr in (myStr1, myStr2, myStr3, myStr4):
    print(f'{aStr}, strlen={len(aStr)}', end=', ')    # Decoded text+length
    bytes1 = aStr.encode()             # Default UTF-8: 2 bytes for accents
    bytes2 = aStr.encode('latin-1')    # Explicit Latin-1: 1 byte per char
    #bytes3 = aStr.encode('ascii')     # ASCII fails: outside 0...127 range
    print(f'byteslen1={len(bytes1)}, byteslen2={len(bytes2)}')  # Encoded length