Remove all Unicode Control Characters and special control characters fast in C-Sharp

Tuesday, August 6, 2019

Remove all Unicode Control Characters and special control characters fast in C-Sharp

Here's how to quickly remove all Unicode control characters, in the strict sense of the term, from a string in C-Sharp.

However, you may notionally consider the Unicode character 'RIGHT-TO-LEFT OVERRIDE' (U+202E) to be a control character, even though it is officially placed in the General Punctuation block.

In the code example below, the isSpecialUnicodeCntrlChr() method identifies the right-to-left and left-to-right characters so that they can be removed as well.

Take a look at these Unicode characters and decide what you consider a control character
https://unicode-table.com/en/blocks/general-punctuation/

Updated Thu 21-Nov-24 11:45pm EST MDC
Updated to use Regex to properly remove all Unicode control characters.

using System;using System.Text;using System.Diagnostics;using System.Collections.Generic;using System.Text.RegularExpressions;
					
public static class Program
{
	// Sample inputs used by the demo in Main.
	private const string SimpleSample = "Hungarian\bGrand\t\t\r\vPrix\u202EF1";
	private const string MixedSample = "ŐhᢰHung\u2063arian\u008D\bGrand\t\t\r\vPrix\u202EF1";

	/// <summary>
	/// Tells whether <paramref name="c"/> is one of the invisible bidirectional
	/// (left-to-right / right-to-left) formatting characters. These are not in the
	/// Unicode "control" category, so <see cref="Char.IsControl(char)"/> does not
	/// report them.
	/// </summary>
	/// <param name="c">The character to test.</param>
	/// <returns><c>true</c> for a bidirectional formatting character; otherwise <c>false</c>.</returns>
	//https://unicode-table.com/en/blocks/general-punctuation/ add some special cases
	public static bool isSpecialUnicodeCntrlChr(this Char c)
	{
		// Keep this list in sync with specialUnicodeCtrlChr below.
		switch (c)
		{
			case '\u061C': //Arabic Letter Mark
			case '\u200E': //Left-To-Right Mark
			case '\u200F': //Right-To-Left Mark
			case '\u202A': //Left-To-Right Embedding
			case '\u202B': //Right-To-Left Embedding
			case '\u202C': //Pop Directional Formatting (terminates an embedding/override)
			case '\u202D': //Left-To-Right Override
			case '\u202E': //Right-To-Left Override
			case '\u2066': //Left-To-Right Isolate
			case '\u2067': //Right-To-Left Isolate
			case '\u2068': //First Strong Isolate
			case '\u2069': //Pop Directional Isolate (terminates an isolate)
				// More candidates (e.g. U+2060 Word Joiner) are listed at
				// https://unicode-table.com/en/blocks/general-punctuation/
				return true;
			default:
				return false;
		}
	}

	/// <summary>
	/// Returns a copy of <paramref name="s"/> without control characters
	/// (as reported by <see cref="Char.IsControl(char)"/>) and without the
	/// bidirectional formatting characters recognised by
	/// <see cref="isSpecialUnicodeCntrlChr"/>.
	/// </summary>
	/// <param name="s">The string to clean.</param>
	/// <returns>The cleaned string; empty when nothing remains.</returns>
	/// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
	public static string RemoveUnicodeControlChars(this string s)
	{
		if (s == null)
		{
			throw new ArgumentNullException(nameof(s));
		}

		// The result can never be longer than the input, so presize the builder.
		StringBuilder sb = new StringBuilder(s.Length);
		foreach (char ch in s)
		{
			if (!Char.IsControl(ch) && !ch.isSpecialUnicodeCntrlChr())
			{
				sb.Append(ch);
			}
		}

		return sb.ToString();
	}

	// Lookup set holding the same characters as isSpecialUnicodeCntrlChr (keep in sync).
	private static readonly HashSet<char> specialUnicodeCtrlChr = new HashSet<char>(new char[]
	{
		'\u061C',
		'\u200E', '\u200F',
		'\u202A', '\u202B', '\u202C', '\u202D', '\u202E',
		'\u2066', '\u2067', '\u2068', '\u2069'
	});

	/// <summary>
	/// Same filtering as <see cref="RemoveUnicodeControlChars"/>, implemented with a
	/// char buffer and a <see cref="HashSet{T}"/> lookup instead of a
	/// <see cref="StringBuilder"/> and a switch.
	/// </summary>
	/// <param name="str">The string to clean.</param>
	/// <returns>The cleaned string; empty when nothing remains.</returns>
	/// <exception cref="ArgumentNullException"><paramref name="str"/> is null.</exception>
	public static string FilterUnicodeControlChars(this string str)
	{
		if (str == null)
		{
			throw new ArgumentNullException(nameof(str));
		}

		// Scratch buffer sized for the worst case (nothing removed).
		char[] buffer = new char[str.Length];
		int index = 0;

		foreach (char ch in str)
		{
			if (!Char.IsControl(ch) && !specialUnicodeCtrlChr.Contains(ch))
			{
				buffer[index++] = ch;
			}
		}

		// Only the first 'index' slots were written.
		return new String(buffer, 0, index);
	}

	// Matches the Control (Cc) and Format (Cf) categories only. The broader \p{C}
	// also contains Cs, and because .NET regexes work on UTF-16 code units it would
	// delete both halves of every surrogate pair (emoji and other supplementary-plane
	// characters), plus private-use and unassigned code points.
	// Note: Cf includes U+200C/U+200D (zero-width non-joiner/joiner) and U+00AD (soft hyphen).
	const string pattern = @"[\p{Cc}\p{Cf}]";
	// Multiline only changes the meaning of ^ and $, which the pattern does not use;
	// Compiled trades a one-time construction cost for faster matching.
	const RegexOptions options = RegexOptions.Compiled;
	private static readonly Regex regRemoveAllUni = new Regex(pattern, options);

	/// <summary>
	/// Runs <paramref name="filter"/> on <paramref name="input"/> and reports how long
	/// that single call took. A fresh stopwatch is used for every measurement so that
	/// timings never accumulate, and console output stays outside the timed region.
	/// </summary>
	private static string Measure(Func<string, string> filter, string input, out long elapsedTicks)
	{
		Stopwatch sw = Stopwatch.StartNew();
		string result = filter(input);
		sw.Stop();
		elapsedTicks = sw.ElapsedTicks;
		return result;
	}

	/// <summary>
	/// Demo: cleans the sample strings with each of the three approaches and prints
	/// the result together with the elapsed stopwatch ticks.
	/// </summary>
	public static void Main()
	{
		long ticks;
		string cleaned;

		Console.WriteLine(SimpleSample);
		cleaned = Measure(RemoveUnicodeControlChars, SimpleSample, out ticks);
		Console.Write(cleaned);
		Console.WriteLine(" in {0} ticks.", ticks);
		Console.WriteLine();

		cleaned = Measure(RemoveUnicodeControlChars, MixedSample, out ticks);
		Console.WriteLine(cleaned);
		Console.WriteLine(" in {0} ticks.", ticks);
		Console.WriteLine();
		Console.WriteLine();

		Console.WriteLine("Using HashSet Filtering");
		cleaned = Measure(FilterUnicodeControlChars, SimpleSample, out ticks);
		Console.Write(cleaned);
		Console.WriteLine(" in {0} ticks.", ticks);
		Console.WriteLine();

		cleaned = Measure(FilterUnicodeControlChars, MixedSample, out ticks);
		Console.WriteLine(cleaned);
		Console.WriteLine(" in {0} ticks.", ticks);

		//TO REMOVE ALL UNICODE TEXT PROPERLY //Thu 21-Nov-24 11:35pm EST MDC -
		Console.WriteLine("\n\nUnicode Regex to remove all Control chars");
		cleaned = Measure(text => regRemoveAllUni.Replace(text, ""), MixedSample, out ticks);
		Console.WriteLine(cleaned);
		Console.WriteLine(" in {0} ticks.", ticks);
	}
}

Metadata Consulting [dot] ca - Blog

Pages

Tuesday, August 6, 2019

Remove all Unicode Control Characters and special control characters fast in C-Sharp

No comments:

Post a Comment

Search and do some good