Here's a fast way to remove all Unicode control characters (in the strict sense of the term) from a string in C#.
However, you may notionally consider a character such as 'RIGHT-TO-LEFT OVERRIDE' (U+202E) to be a control character, even though it is officially not one: it lives in the General Punctuation block and its general category is Format (Cf), not Control (Cc).
In the code example below, the isSpecialUnicodeCntrlChr() method identifies the right-to-left and left-to-right characters so that they are removed as well.
Take a look at these Unicode characters and decide what you consider a control character:
https://unicode-table.com/en/blocks/general-punctuation/
Updated Thu 21-Nov-24 11:45pm EST MDC
Updated to use a Regex to properly remove all Unicode control characters.
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Text;
using System.Text.RegularExpressions;

/// <summary>
/// Demonstrates three ways of stripping Unicode control characters from a string
/// (a switch-based filter, a HashSet-based filter and a Regex) and prints how many
/// stopwatch ticks each one took.
/// </summary>
public static class Program
{
    // https://unicode-table.com/en/blocks/general-punctuation/ add some special cases

    /// <summary>
    /// Tells whether <paramref name="c"/> is one of the bidirectional formatting
    /// characters that this sample treats as a control character, although
    /// <see cref="Char.IsControl(char)"/> returns false for them.
    /// </summary>
    /// <param name="c">The UTF-16 code unit to test.</param>
    /// <returns>true for the listed left-to-right / right-to-left characters; otherwise false.</returns>
    /// <remarks>
    /// Keep this list in sync with <c>specialUnicodeCtrlChr</c>.
    /// Related characters that are NOT covered: U+202C (Pop Directional Formatting),
    /// U+2068 (First Strong Isolate), U+2069 (Pop Directional Isolate) and
    /// U+2060 (Word Joiner). Add them here and to the HashSet if you also
    /// consider them control characters.
    /// </remarks>
    public static bool isSpecialUnicodeCntrlChr(this Char c)
    {
        // remove left-to-rights and right-to-lefts
        switch (c)
        {
            case '\u200E': // Left-To-Right Mark
            case '\u200F': // Right-To-Left Mark
            case '\u202A': // Left-To-Right Embedding
            case '\u202B': // Right-To-Left Embedding
            case '\u202D': // Left-To-Right Override
            case '\u202E': // Right-To-Left Override
            case '\u2066': // Left-To-Right Isolate
            case '\u2067': // Right-To-Left Isolate
                return true;
            default:
                return false;
        }
    }

    /// <summary>
    /// Returns a copy of <paramref name="s"/> without the characters for which
    /// <see cref="Char.IsControl(char)"/> is true and without the characters
    /// matched by <see cref="isSpecialUnicodeCntrlChr"/>.
    /// </summary>
    /// <param name="s">The text to clean. May be null or empty.</param>
    /// <returns>The cleaned text; null or empty input is returned unchanged.</returns>
    public static string RemoveUnicodeControlChars(this string s)
    {
        // Nothing to filter; also avoids a NullReferenceException on null input.
        if (string.IsNullOrEmpty(s))
        {
            return s;
        }

        // The result can never be longer than the input, so presize the builder.
        StringBuilder sb = new StringBuilder(s.Length);
        for (int i = 0; i < s.Length; i++)
        {
            if (!Char.IsControl(s[i]) && !s[i].isSpecialUnicodeCntrlChr())
            {
                sb.Append(s[i]);
            }
        }

        return sb.ToString();
    }

    // Lookup set holding the same characters as isSpecialUnicodeCntrlChr.
    private static readonly HashSet<char> specialUnicodeCtrlChr = new HashSet<char>(
        new char[] { '\u200E', '\u200F', '\u202A', '\u202B', '\u202D', '\u202E', '\u2066', '\u2067' });

    /// <summary>
    /// Same filtering as <see cref="RemoveUnicodeControlChars"/>, implemented with a
    /// char buffer and a HashSet lookup instead of a StringBuilder and a switch.
    /// </summary>
    /// <param name="str">The text to clean. May be null or empty.</param>
    /// <returns>The cleaned text; null or empty input is returned unchanged.</returns>
    public static string FilterUnicodeControlChars(this string str)
    {
        // Nothing to filter; also avoids a NullReferenceException on null input.
        if (string.IsNullOrEmpty(str))
        {
            return str;
        }

        // Temporary buffer: the output is at most as long as the input.
        char[] buffer = new char[str.Length];
        int index = 0;

        // Keep every character that is neither a control nor a special character.
        foreach (var ch in str)
        {
            if (!Char.IsControl(ch) && !specialUnicodeCtrlChr.Contains(ch))
            {
                buffer[index++] = ch;
            }
        }

        // Only the first 'index' slots of the buffer were filled.
        return new String(buffer, 0, index);
    }

    // Speed optimization: build the Regex once and reuse it.
    // \p{C} matches every character in the Unicode "Other" categories
    // (Cc, Cf, Cs, Co, Cn). Because .NET regexes work on UTF-16 code units, this
    // also removes surrogates, i.e. characters outside the Basic Multilingual Plane.
    const string pattern = @"\p{C}";

    // RegexOptions.Multiline only changes the meaning of ^ and $, which the pattern
    // does not use, so no option is needed.
    const RegexOptions options = RegexOptions.None;
    private static readonly Regex regRemoveAllUni = new Regex(pattern, options);

    /// <summary>
    /// Runs each removal strategy on sample strings and prints the result together
    /// with the elapsed stopwatch ticks of that single call.
    /// </summary>
    /// <remarks>
    /// These are single-shot timings and should be read as indicative only. The
    /// stopwatch is restarted before every measurement so that a result never
    /// includes the time of the previous ones.
    /// </remarks>
    public static void Main()
    {
        Stopwatch sw = new Stopwatch();

        Console.WriteLine("Hungarian\bGrand\t\t\r\vPrix\u202EF1");

        sw.Restart();
        Console.Write("Hungarian\bGrand\t\t\r\vPrix\u202EF1".RemoveUnicodeControlChars());
        sw.Stop();
        Console.WriteLine(" in {0} ticks.", sw.ElapsedTicks);
        Console.WriteLine();

        sw.Restart();
        Console.WriteLine("ŐhᢰHung\u2063arian\u008D\bGrand\t\t\r\vPrix\u202EF1".RemoveUnicodeControlChars());
        sw.Stop();
        Console.WriteLine(" in {0} ticks.", sw.ElapsedTicks);
        Console.WriteLine();
        Console.WriteLine();

        Console.WriteLine("Using HashSet Filtering");
        sw.Restart();
        Console.Write("Hungarian\bGrand\t\t\r\vPrix\u202EF1".FilterUnicodeControlChars());
        sw.Stop();
        Console.WriteLine(" in {0} ticks.", sw.ElapsedTicks);
        Console.WriteLine();

        sw.Restart();
        Console.WriteLine("ŐhᢰHung\u2063arian\u008D\bGrand\t\t\r\vPrix\u202EF1".FilterUnicodeControlChars());
        sw.Stop();
        Console.WriteLine(" in {0} ticks.", sw.ElapsedTicks);

        // TO REMOVE ALL UNICODE TEXT PROPERLY
        // Thu 21-Nov-24 11:35pm EST MDC
        // The header is printed before the stopwatch starts so it is not timed.
        Console.WriteLine("\n\nUnicode Regex to remove all Control chars");
        sw.Restart();
        Console.WriteLine(regRemoveAllUni.Replace("ŐhᢰHung\u2063arian\u008D\bGrand\t\t\r\vPrix\u202EF1", ""));
        sw.Stop();
        Console.WriteLine(" in {0} ticks.", sw.ElapsedTicks);
    }
}
No comments:
Post a Comment