Here's a fast way to remove all Unicode control characters from a string in C# (control characters in the strict sense, i.e. what Char.IsControl reports).
However, you may notionally consider the Unicode character 'RIGHT-TO-LEFT OVERRIDE' (U+202E) a control character, even though it is officially categorized under General Punctuation.
In the code example below, the isSpecialUnicodeCntrlChr() method identifies the right-to-left and left-to-right characters so that they are removed as well.
Take a look at these Unicode characters and decide what you consider a control character:
https://unicode-table.com/en/blocks/general-punctuation/
However, you may notionally consider the Unicode character 'RIGHT-TO-LEFT OVERRIDE' (U+202E) a control character, even though it is officially categorized under General Punctuation.
In the code example below, the isSpecialUnicodeCntrlChr() method identifies the right-to-left and left-to-right characters so that they are removed as well.
Take a look at these Unicode characters and decide what you consider a control character:
https://unicode-table.com/en/blocks/general-punctuation/
using System;
using System.Text;

/// <summary>
/// Demonstrates stripping Unicode control characters, plus the bidirectional
/// formatting characters from the General Punctuation block, out of a string.
/// </summary>
public static class Program
{
    /// <summary>
    /// Reports whether <paramref name="c"/> is one of the bidirectional
    /// formatting characters that this program treats as a control character
    /// in addition to those recognized by <see cref="Char.IsControl(char)"/>.
    /// </summary>
    /// <param name="c">The UTF-16 code unit to classify.</param>
    /// <returns>
    /// <c>true</c> for the left-to-right / right-to-left marks, embeddings,
    /// overrides and isolates and their terminators; otherwise <c>false</c>.
    /// </returns>
    public static bool isSpecialUnicodeCntrlChr(this Char c)
    {
        // See https://unicode-table.com/en/blocks/general-punctuation/ for the
        // full block; extend this list (e.g. with U+2060 Word Joiner) if you
        // want to treat further invisible characters as control characters.
        switch (c)
        {
            case '\u200E': // Left-To-Right Mark
            case '\u200F': // Right-To-Left Mark
            case '\u202A': // Left-To-Right Embedding
            case '\u202B': // Right-To-Left Embedding
            case '\u202C': // Pop Directional Formatting (closes an embedding/override)
            case '\u202D': // Left-To-Right Override
            case '\u202E': // Right-To-Left Override
            case '\u2066': // Left-To-Right Isolate
            case '\u2067': // Right-To-Left Isolate
            case '\u2068': // First Strong Isolate
            case '\u2069': // Pop Directional Isolate (closes an isolate)
                return true;
            default:
                return false;
        }
    }

    /// <summary>
    /// Returns a copy of <paramref name="s"/> without the characters for which
    /// <see cref="Char.IsControl(char)"/> or
    /// <see cref="isSpecialUnicodeCntrlChr(char)"/> returns <c>true</c>.
    /// </summary>
    /// <param name="s">The string to clean.</param>
    /// <returns>The input with all matching characters removed; other characters keep their order.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="s"/> is <c>null</c>.</exception>
    public static string RemoveUnicodeControlChars(this string s)
    {
        if (s == null)
        {
            throw new ArgumentNullException(nameof(s));
        }

        // The result can never be longer than the input, so presize the buffer.
        StringBuilder kept = new StringBuilder(s.Length);
        foreach (char c in s)
        {
            if (!Char.IsControl(c) && !c.isSpecialUnicodeCntrlChr())
            {
                kept.Append(c);
            }
        }

        return kept.ToString();
    }

    /// <summary>
    /// Prints two sample strings, each followed by its cleaned version.
    /// </summary>
    public static void Main()
    {
        string asciiSample = "Hungarian\bGrand\t\t\r\vPrix\u202EF1";
        Console.WriteLine(asciiSample);
        Console.WriteLine(asciiSample.RemoveUnicodeControlChars());

        Console.WriteLine();

        // U+2063 (Invisible Separator) is neither matched by Char.IsControl nor
        // listed in isSpecialUnicodeCntrlChr, so it is kept in the output.
        string unicodeSample = "ŐhᢰHung\u2063arian\u008D\bGrand\t\t\r\vPrix\u202EF1";
        Console.WriteLine(unicodeSample);
        Console.WriteLine(unicodeSample.RemoveUnicodeControlChars());
    }
}
No comments:
Post a Comment