Updated - //Thu 10-Mar-23
Hello C# Developers, here's a quick code share.
To start with, it is important to know that all C# strings are encoded as UTF-16. The function below restricts the text to what can be encoded as UTF-8, so you may lose some characters (for example, unpaired surrogates, which cannot be encoded) when you pump your string through this function.
Problem: Recently, I was working on a C# solution and hit a fault when encoding the result of StringBuilder.ToString(), which was causing an EncoderFallbackException error:
"Unable to translate Unicode character \uxxxx at index xxx to specified code page."
Clearly I was attempting to write characters outside the UTF-8 character range, which I did not want. I just wanted a clean UTF-8 character string. I could catch the EncoderFallbackException and clean up afterwards, but I wanted to avoid the exception altogether and was not interested in its details.
Requirement: So I needed a solution that converts a StringBuilder to "clean" UTF-8 and then assembles a string, before using the StringBuilder.ToString() function. "Clean" means stripping all characters outside the UTF-8 range.
Issue: Most solutions on Stack Overflow and MSDN convert the StringBuilder to a string using the
StringBuilder.ToString() function and then convert the result to UTF-8; this is very common.
Solution:
So here is my working code: a toUTF8String(this StringBuilder sb) extension method, in a static StringExtensions class, that converts a StringBuilder to a UTF-8 string using the string-extensions pattern.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace StringBuilderToUTF8
{
    /// <summary>
    /// Extension methods that return a "clean" copy of a text: every character the
    /// target encoding cannot represent is silently dropped instead of raising an
    /// <see cref="EncoderFallbackException"/>.
    /// </summary>
    public static class StringExtensions
    {
        // Both encoders replace unencodable characters with the empty string, so
        // encoding never throws. Decoding keeps the exception fallback, but it only
        // ever sees bytes produced by the same encoder.
        private static readonly Encoding Utf8Encoder = Encoding.GetEncoding(
            "UTF-8",
            new EncoderReplacementFallback(string.Empty),
            new DecoderExceptionFallback());

        private static readonly Encoding ISO88591Encoder = Encoding.GetEncoding(
            "ISO-8859-1",
            new EncoderReplacementFallback(string.Empty),
            new DecoderExceptionFallback());

        /// <summary>
        /// Returns the content of <paramref name="sb"/> with every character that cannot
        /// be encoded as UTF-8 (e.g. an unpaired surrogate) removed.
        /// </summary>
        /// <param name="sb">The builder holding the (UTF-16) source text.</param>
        /// <returns>The text after an encode/decode round trip through UTF-8.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="sb"/> is null.</exception>
        public static string toUTF8String(this StringBuilder sb)
        {
            if (sb == null)
            {
                throw new ArgumentNullException(nameof(sb));
            }

            return RoundTrip(Utf8Encoder, sb.ToString());
        }

        /// <summary>
        /// Returns the content of <paramref name="sb"/> with every character that cannot
        /// be encoded as ISO-8859-1 removed.
        /// </summary>
        /// <param name="sb">The builder holding the (UTF-16) source text.</param>
        /// <returns>The text after an encode/decode round trip through ISO-8859-1.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="sb"/> is null.</exception>
        public static string toISO88591String(this StringBuilder sb)
        {
            if (sb == null)
            {
                throw new ArgumentNullException(nameof(sb));
            }

            return RoundTrip(ISO88591Encoder, sb.ToString());
        }

        /// <summary>
        /// Same result as <see cref="toUTF8String"/>, but copies the builder's characters
        /// into a char array instead of calling <see cref="StringBuilder.ToString()"/>.
        /// </summary>
        /// <param name="sb">The builder holding the (UTF-16) source text.</param>
        /// <returns>The text after an encode/decode round trip through UTF-8.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="sb"/> is null.</exception>
        public static string toUTF8StringFastCopy(this StringBuilder sb)
        {
            if (sb == null)
            {
                throw new ArgumentNullException(nameof(sb));
            }

            char[] chars = new char[sb.Length];
            sb.CopyTo(0, chars, 0, sb.Length);
            return Utf8Encoder.GetString(Utf8Encoder.GetBytes(chars));
        }

        // Based on http://www.codeproject.com/Articles/13503/Stripping-Accents-from-Latin-Characters-A-Foray-in
        // Wed 08-Feb-23 9:34am metadataconsulting.ca - faster
        /// <summary>
        /// Applies compatibility decomposition (FormKD) to <paramref name="s"/> and keeps
        /// only the resulting characters whose code is in the range 31..255.
        /// </summary>
        /// <param name="s">The text to reduce.</param>
        /// <returns>The decomposed text without the characters outside that range.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
        public static string UnicodeToANSI(this string s)
        {
            if (s == null)
            {
                throw new ArgumentNullException(nameof(s));
            }

            // NOTE(review): "> 30" lets U+001F (a control character) through while
            // dropping tab/CR/LF; presumably "> 31" was intended - confirm before changing.
            char[] kept = s.Normalize(NormalizationForm.FormKD)
                           .Where(c => c > 30 && c <= 255)
                           .ToArray();
            return new string(kept);
        }

        // Encodes text with the given encoding and decodes the bytes straight back, so
        // the encoder's replacement fallback strips whatever cannot be represented.
        private static string RoundTrip(Encoding encoding, string text)
        {
            byte[] encoded = encoding.GetBytes(text);
            return encoding.GetString(encoded, 0, encoded.Length);
        }
    }

    /// <summary>Console demo that prints one sample text through each conversion.</summary>
    class Program
    {
        static void Main(string[] args)
        {
            Console.OutputEncoding = Encoding.UTF8;

            // U+24B62 lies above U+FFFF, so UTF-16 stores it as a surrogate pair (two
            // 16-bit code units): subtract 0x10000 to get 0x14B62, then split those
            // 20 bits into a high and a low half, giving 0xD852 0xDF62.
            // The sample also contains combining characters ("a\u0304\u0308") and CJK text.
            string utf16 = "Menü \uD852\uDF62 a\u0304\u0308 你好";

            StringBuilder sb = new StringBuilder(utf16);

            // The first line shows the unfiltered input for comparison.
            Console.WriteLine("Original UTF-16 = " + utf16);
            Console.WriteLine("Clean ISO-8859-1 only = " + sb.toISO88591String());
            Console.WriteLine("Clean UTF8 only = " + sb.toUTF8String());
            Console.WriteLine("Clean UTF8 fast copy = " + sb.toUTF8StringFastCopy());

            // ReadKey throws InvalidOperationException when stdin is redirected.
            if (!Console.IsInputRedirected)
            {
                Console.ReadKey();
            }
        }
    }
}
No comments:
Post a Comment