Attribute VB_Name = "ADOStream" 'Patrick O'Beirne, Systems Modelling Ltd 'pob at sysmod.com 'http://www.sysmod.com 'http://ie.linkedin.com/in/patrickobeirne 'http://sysmod.wordpress.com 't: @ExcelAnalytics 'https://sysmod.wordpress.com/2020/09/28/vba-read-unicode-file-contents-in-various-encodings/ 'In VBA, the legacy VB Open statement, and the functions Dir(), Filelen(), FileDateTime() _ cannot handle Unicode file names and folders. The Filesystem Stream object can read Unicode names _ but cannot handle UTF-8 encoding. So, I use the ADO.Stream object to handle a number of different cases. _ The variations are not complete but it handles the file types I get. _ Let me know of any improvements you can suggest. 'The functions here mostly return Err.Number as long, and other data is returned byref in the parameters 'https://sysmod.wordpress.com/2021/01/12/reading-unicode-file-contents-as-text/ Option Explicit Option Private Module Private Const SUCCESS As Long = 0 'MessageBoxW correctly displays international Unicode characters Private Declare PtrSafe Function MessageBoxW Lib "USER32" _ (Optional ByVal hWnd As LongPtr, Optional ByVal Prompt As LongPtr, _ Optional ByVal Caption As LongPtr, Optional ByVal Buttons As Long) As Long 'https://docs.microsoft.com/en-us/windows/win32/api/winuser/nf-winuser-messageboxw 'int MessageBoxW( HWND hWnd, LPCWSTR lpText, LPCWSTR lpCaption, UINT uType); 'MB_OK 0 one push button: OK. This is the default. 'MB_OKCANCEL 1 OK and Cancel. 'MB_ABORTRETRYIGNORE 2 Abort, Retry, and Ignore. 'MB_YESNOCANCEL 3 Yes, No, and Cancel. 'MB_YESNO 4 Yes and No. 'MB_RETRYCANCEL 5 Retry and Cancel. 'MB_CANCELTRYCONTINUE 6 Cancel, Try Again, Continue. Function MessageBox(Prompt As String, Optional Buttons As Long = VbMsgBoxStyle.vbOKOnly, Optional Title As String = "Message") As VbMsgBoxResult 'parameter order compatible with msgbox prompt,VbMsgBoxStylebuttons,title[,helpfile,context] 'MB_ constants are the same as VbMsgBoxStyle ' only first 2048 chars of prompt in case it's too long MessageBox = MessageBoxW(Prompt:=StrPtr(Left$(Prompt, 2048)), Caption:=StrPtr(Title), Buttons:=Buttons + vbMsgBoxSetForeground) End Function ' infer charset from Byte Order Marker if possible. ReadBOM Function ReadFileBOMCharset(ByVal filename As String, ByRef charset As String) As Long Dim bytes() As Byte ReadFileBOMCharset = ReadADOStreamBytes(filename, bytes, 3) If ReadFileBOMCharset = SUCCESS Then charset = CharsetFromBytes(bytes) End If End Function 'https://en.wikipedia.org/wiki/Byte_order_mark 'Byte Order Mark (BOM) - FEFF - EFBBBF 'Encoding (hexadecimal) (decimal) Bytes as CP1252 characters 'UTF-8 EF BB BF 239 187 191  'UTF-16 (BE) FE FF 254 255 þÿ 'UTF-16 (LE) FF FE 255 254 ÿþ ' header is 239,187,191 EFBBBF 'When reading the file, a Windows application converts the endianness, invisibly from the user. 'An example of the first case is the binary XLS file format that is portable between Windows and Mac systems ' and always little-endian, leaving the Mac application to swap the bytes on load and save ' when running on a big-endian Motorola 68K or PowerPC processor. ' pass in first 3 bytes of a file, infer charset if there is any Byte Order Marker Function CharsetFromBytes(bytes() As Byte) As String If bytes(0) = 239 And bytes(1) = 187 And bytes(2) = 191 Then ' EF BB BF 'BOMlen = 3 &HEFBBBF CharsetFromBytes = "UTF-8" Else If bytes(0) = 255 And bytes(1) = 254 Then ' FF FE 'BOMlen = 2 ' &HFFEE CharsetFromBytes = "UTF-16LE" ' aka "Unicode" ElseIf bytes(0) = 254 And bytes(1) = 255 Then ' FE FF 'BOMlen = 2 ' &HFEFF CharsetFromBytes = "UTF-16BE" ' aka "BigEndianUnicode" Else ' no BOM, no idea what charset 'BOMlen = 0 CharsetFromBytes = vbNullString End If End If End Function 'https://social.msdn.microsoft.com/Forums/en-US/4478b233-51df-42e8-8938-85c8f6c75f82/read-unicode-character-using-openrowset 'Windows code page Name System.Text.Encoding schema.ini CharacterSet '20127 ASCII (US) ASCII 20127 '1252 ANSI Latin I Default ANSI '65001 UTF-8 UTF8 65001 '1200 UTF-16 LE Unicode Unicode '1201 UTF-16 BE BigEndianUnicode 1201 ' GUESS the character encoding, either known from BOM, or inferred from entire contents Function ReadADOStreamFile(ByVal FullName As String, ByRef text As String, _ ByRef charset As String, Optional ByRef numchars As Long = -1) As Long ' Check first 3 bytes for any BOM If ReadFileBOMCharset(FullName, charset) = SUCCESS Then If Len(charset) > 0 Then ' file has a BOM and so we know the charset ReadADOStreamFile = ReadADOStreamText(FullName, text, charset, numchars) Else ' no BOM found or inferred, last chance, infer from UTF-8 rendition ' first guess. If it's not UTF-8 it may still be readable as UTF-8 ' UTF-8 files may not have a BOM charset = "UTF-8" ReadADOStreamFile = ReadADOStreamText(FullName, text, charset, numchars) If ReadADOStreamFile = SUCCESS Then Dim lUTF8Len As Long lUTF8Len = Len(text) If InStr(1, text, ChrW$(&HFFFD), vbBinaryCompare) > 0 Then ' 65533 ' Check for U+FFFD replacement character ' used to replace an unknown, unrecognized or unrepresentable character 'https://en.wikipedia.org/wiki/Specials_(Unicode_block) 'https://en.wikipedia.org/wiki/Mojibake ' second chance, try Windows code page 1252, a superset of ISO-8859-1, Latin 1 charset = "Windows-1252" ReadADOStreamFile = ReadADOStreamText(FullName, text, charset, numchars) 'If Len(Text) > lUTF8Len Then 'Debug.Print charset & " converted some characters into multiple characters" ' so it must be some other encoding. No idea. 'End If Else 'There are two UTF-16 encodings supported by ADODB.Stream: _ little-endian UTF-16LE (which is the default and synonymous with UTF-16) _ and big-endian UTF-16BE, with the byte order reversed. ' No sure way to detect UTF-16 Unicode without a BOM ' we could check if every chr(10) is followed by a chr(0). This is simpler: If InStr(1, text, Chr$(0)) > 0 Then ' UTF-16 or nulls? Might be double-byte character set. ' but if it was it should have had a BOM and been detected by ReadFileBOMchars above charset = "UTF-16" ' same as little-endian UTF-16LE ReadADOStreamFile = ReadADOStreamText(FullName, text, charset, numchars) 'Debug.Print charset & " len=" & Format$(Len(Text), "#,###") & ", utf-8 len=" & lUTF8Len 'If lUTF8Len = Len(Text) * 2 Then ' ReadFSOStreamText(FileName, Text) would work here too for UCS2 ' Debug.Print "As expected, double byte character set" 'End If End If End If Else MessageBox FullName & vbLf & "Error " & Err & " " & Err.Description End If End If End If End Function ' Another method which looks for null characters: 'https://www.codeproject.com/Tips/672470/Simple-Character-Encoding-Detection Function ReadADOStreamText(ByVal filename As String, ByRef text As String, _ Optional ByVal charset As String = "windows-1252", Optional ByRef numchars As Long) As Long 'https://docs.microsoft.com/en-us/sql/ado/reference/ado-api/stream-object-properties-methods-and-events 'numchars Optional. The number of characters to read from the file, _ or a StreamReadEnum value. Default is adReadAll=-1 Dim oStream As ADODB.Stream Set oStream = New ADODB.Stream ' late: CreateObject("ADODB.Stream") With oStream .Type = adTypeText 'adTypeText = 2 Specify stream type - text/string data. '.Mode = adModeRead ' don't specify this or .LoadFromFile gets 'Err 3219 Operation is not allowed in this context. 'see allowed encodings in registry HKEY_CLASSES_ROOT\MIME\Database\Charset If Len(charset) > 0 Then ' Do Not use " BOM" in charset. Needs hyphen, UTF-8 not UTF8 .charset = charset ' case-insensitive, default value is "us-ascii" End If text = vbNullString ' in case of err ' comment out handler to let caller handle any error On Error GoTo OnError ' to show an error message, or use On Error Resume Next to avoid Messagebox .Open .LoadFromFile filename If numchars <= 0 Then ' -1 or 0, whole file, in chunks PRB KB280067 recommends 128K, 2^17 Do While Not .EOS text = text & .ReadText(262144) ' 256K Loop Else text = .ReadText(numchars) End If .Close End With Set oStream = Nothing ExitProc: ReadADOStreamText = Err.Number ' in case On Error Resume Next was used Exit Function OnError: MessageBox Err & " " & Err.Description & vbLf & filename, vbOKOnly, "ReadADOStreamBytes" Resume ExitProc End Function Function WriteADOStreamText(ByVal filename As String, ByRef text As String, _ Optional ByRef charset As String) As Long If Len(charset) = 0 Then charset = "windows-1252" End If With CreateObject("ADODB.Stream") .Type = 2 'adTypeText = 2 Specify stream type - text/string data. 'see allowed in HKEY_CLASSES_ROOT\MIME\Database\Charset .charset = charset ' case-insensitive, default value is "us-ascii" .Open .WriteText text .SaveToFile filename, 2 ' adSaveCreateNotExist=1 = no overwrite, adSaveCreateOverWrite=2 = overwrite .Close WriteADOStreamText = Err.Number End With End Function ' reading bytes with no charset, just 1 char per byte Function ReadADOStreamBytes(ByVal filename As String, ByRef bytes() As Byte, _ Optional ByRef numchars As Long = -1) As Long 'https://docs.microsoft.com/en-us/sql/ado/reference/ado-api/stream-object-properties-methods-and-events 'numchars Optional. The number of characters to read from the file, _ or a StreamReadEnum value. Default is adReadAll=-1 Dim oStream As ADODB.Stream ' set reference to Microsoft ActiveX Data Objects Set oStream = New ADODB.Stream ' late binding: CreateObject("ADODB.Stream") With oStream .Type = adTypeBinary '=1 adTypeText = 2 Specify stream type '.Mode = adModeRead 'see allowed in HKEY_CLASSES_ROOT\MIME\Database\Charset On Error GoTo OnError ' uncomment this for local error handling 'Err 3002 File could not be opened. .Open .LoadFromFile filename ' err 3219 if adModeRead specified bytes = .Read(numchars) .Close End With Set oStream = Nothing ExitProc: ReadADOStreamBytes = Err.Number Exit Function OnError: MessageBox Err & " " & Err.Description & vbLf & filename, vbOKOnly, "ReadADOStreamBytes" Resume ExitProc End Function Function WriteADOStreamBytes(ByVal filename As String, ByRef bytes() As Byte) As Long With CreateObject("ADODB.Stream") .Type = adTypeBinary '.charset not allowed in binary context .Open .Write bytes .SaveToFile filename, 2 ' adSaveCreateNotExist=1 = no overwrite, adSaveCreateOverWrite=2 = overwrite .Close WriteADOStreamBytes = Err.Number End With End Function