Tuesday, August 6, 2019

Remove all Unicode Control Characters and special control characters fast in C-Sharp

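Here's how to quickly remove all Unicode control characters from a string in C#. Strictly speaking, "control characters" means the characters in the Unicode Control (Cc) category, which is what Char.IsControl() tests for.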

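However, you may notionally consider a character such as 'RIGHT-TO-LEFT OVERRIDE' (U+202E) to be a control character, even though it officially lives in the General Punctuation block and its general category is Format (Cf) rather than Control, so Char.IsControl() does not flag it.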

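In the code example below, the isSpecialUnicodeCntrlChr() method identifies the right-to-left and left-to-right directional characters so that RemoveUnicodeControlChars() can strip them along with the regular control characters.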

Take a look at these Unicode characters and decide which ones you consider to be control characters:
https://unicode-table.com/en/blocks/general-punctuation/


 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
using System;using System.Text;
     
/// <summary>
/// Demo program plus two extension methods that strip control characters and
/// bidirectional (left-to-right / right-to-left) formatting characters from text.
/// </summary>
public static class Program
{
    /// <summary>
    /// Reports whether <paramref name="c"/> is one of the bidirectional formatting
    /// characters that <see cref="Char.IsControl(char)"/> does not flag but that
    /// this file treats as "control-like".
    /// </summary>
    /// <param name="c">The UTF-16 code unit to classify.</param>
    /// <returns>
    /// <c>true</c> for the directional marks, embeddings, overrides, isolates and
    /// their terminators listed in the switch; <c>false</c> for everything else.
    /// </returns>
    // https://unicode-table.com/en/blocks/general-punctuation/ lists further candidates.
    public static bool isSpecialUnicodeCntrlChr(this Char c)
    {
        switch (c)
        {
            case '\u200E': // Left-To-Right Mark
            case '\u200F': // Right-To-Left Mark
            case '\u202A': // Left-To-Right Embedding
            case '\u202B': // Right-To-Left Embedding
            case '\u202C': // Pop Directional Formatting (terminates embeddings/overrides)
            case '\u202D': // Left-To-Right Override
            case '\u202E': // Right-To-Left Override
            case '\u2066': // Left-To-Right Isolate
            case '\u2067': // Right-To-Left Isolate
            case '\u2068': // First Strong Isolate
            case '\u2069': // Pop Directional Isolate (terminates isolates)
                // The terminators are removed together with the openers so that
                // a cleaned string is not left with orphaned bidi controls.
                return true;

            // Add more here if you consider them control characters, e.g.:
            //   '\u2060' Word Joiner
            default:
                return false;
        }
    }

    /// <summary>
    /// Returns a copy of <paramref name="s"/> with every character removed for which
    /// <see cref="Char.IsControl(char)"/> or <see cref="isSpecialUnicodeCntrlChr"/>
    /// returns <c>true</c>. All other characters are kept in their original order.
    /// </summary>
    /// <param name="s">The text to clean; must not be <c>null</c>.</param>
    /// <returns>The filtered string (empty when every character was removed).</returns>
    /// <exception cref="ArgumentNullException"><paramref name="s"/> is <c>null</c>.</exception>
    public static string RemoveUnicodeControlChars(this string s)
    {
        if (s == null)
        {
            throw new ArgumentNullException(nameof(s));
        }

        // The result can never be longer than the input, so presize to avoid regrowth.
        StringBuilder kept = new StringBuilder(s.Length);
        foreach (char c in s)
        {
            if (!Char.IsControl(c) && !c.isSpecialUnicodeCntrlChr())
            {
                kept.Append(c);
            }
        }

        return kept.ToString();
    }

    /// <summary>
    /// Prints two sample strings before and after cleaning.
    /// </summary>
    public static void Main()
    {
        Console.WriteLine("Hungarian\bGrand\t\t\r\vPrix\u202EF1");
        Console.WriteLine("Hungarian\bGrand\t\t\r\vPrix\u202EF1".RemoveUnicodeControlChars());
        Console.WriteLine();

        // \u008D is removed by the Char.IsControl check; \u2063 (Invisible Separator)
        // survives because neither check matches it.
        Console.WriteLine("ŐhᢰHung\u2063arian\u008D\bGrand\t\t\r\vPrix\u202EF1");
        Console.WriteLine("ŐhᢰHung\u2063arian\u008D\bGrand\t\t\r\vPrix\u202EF1".RemoveUnicodeControlChars());
    }
}

No comments:

Post a Comment