Here's how to reduce a string to ASCII and remove control characters from it quickly in C#. But be careful: plain removal simply drops é, it does not replace it with e. To do that you need to normalize the string first; see the UnicodeToAscii function.
ASCII (American Standard Code for Information Interchange) is a 7-bit character set that contains characters from 0 to 127.
The generic term ANSI (American National Standards Institute) is used for 8-bit character sets. These character sets contain the unchanged ASCII character set. In addition, they contain further characters from 128 to 255.
Here's a list of control characters. https://unicode-table.com/en/blocks/general-punctuation/
using System;
using System.Diagnostics;
using System.Linq;
using System.Text;

/// <summary>
/// Demo program comparing two ways of reducing a string to 7-bit ASCII
/// without control characters: plain filtering, and filtering after Unicode
/// compatibility decomposition (so that accented letters keep their base letter).
/// </summary>
public static class Program
{
    // Based on http://www.codeproject.com/Articles/13503/Stripping-Accents-from-Latin-Characters-A-Foray-in

    /// <summary>
    /// Proper normalization: decomposes <paramref name="s"/> with
    /// <see cref="NormalizationForm.FormKD"/> and then keeps only the characters
    /// accepted by <see cref="ReducetoASCII"/>. Because an accented letter is
    /// decomposed into base letter + combining mark, "é" becomes "e" instead of
    /// being dropped.
    /// </summary>
    /// <param name="s">The text to convert.</param>
    /// <returns>The decomposed text restricted to character codes 32..127.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
    public static string UnicodeToAscii(this string s)
    {
        if (s == null)
        {
            throw new ArgumentNullException(nameof(s));
        }

        // Reuse the single filter so both methods agree on what a control
        // character is. The previous inline test (x > 30) let U+001F through.
        return s.Normalize(NormalizationForm.FormKD).ReducetoASCII();
    }

    /// <summary>
    /// Removes every character outside the range 32..127 from
    /// <paramref name="s"/>: control characters below 32 and everything above
    /// the 7-bit ASCII range. No replacement is attempted, so "é" is removed.
    /// </summary>
    /// <remarks>
    /// ANSI characters 32 to 127 correspond to those in the 7-bit ASCII
    /// character set. Code 127 (DEL) is kept, matching that range.
    /// </remarks>
    /// <param name="s">The text to filter.</param>
    /// <returns>The characters of <paramref name="s"/> with codes 32..127, in order.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
    public static string ReducetoASCII(this string s)
    {
        if (s == null)
        {
            throw new ArgumentNullException(nameof(s));
        }

        var sb = new StringBuilder(s.Length);
        foreach (char c in s)
        {
            // Keep only codes 32..127; drop control characters and non-ASCII.
            if (c >= 32 && c <= 127)
            {
                sb.Append(c);
            }
        }

        return sb.ToString();
    }

    /// <summary>
    /// Runs both conversions on a French and a German sample and prints each
    /// result together with the elapsed stopwatch ticks.
    /// </summary>
    public static void Main()
    {
        // The line break that sat inside this literal is written as an explicit
        // escape: a regular C# string literal cannot span source lines.
        string french = "A Paris, le cortège parisien s’était élancé à 14 heures.\r\n\tFace à l’affluence, un «itinéraire bis» a été mis en place. D’importants rassemblements ont lieu à Bordeaux, Marseille, Rennes ou Lyon. \r\n"
            + "Suivez la journée avec nos journalistes dans toute la France.";
        string german = "Trump\t\r\nverwechselt Klägerin Carroll auf Foto mit Ex-Frau – das könnte Folgen haben";

        Report(french);
        Console.WriteLine();
        Console.WriteLine();
        Report(german);
    }

    // Prints the sample, then each conversion's output followed by its elapsed ticks.
    // Each conversion is timed once, so the numbers are indicative only.
    private static void Report(string text)
    {
        Console.WriteLine(text);

        Stopwatch sw = Stopwatch.StartNew();
        string reduced = text.ReducetoASCII();
        sw.Stop();
        Console.WriteLine("Ansi reduced\r\n" + reduced + " in " + sw.ElapsedTicks);

        sw.Restart();
        string normalized = text.UnicodeToAscii();
        sw.Stop();
        Console.WriteLine("Proper Normalization\r\n" + normalized + " in " + sw.ElapsedTicks);
    }
}
No comments:
Post a Comment