Friday, March 26, 2021

C# .NET - How to get the proper length of a Unicode string

That's Italian for ruler 

Here's how to get the proper length of a Unicode string, which is not the same as the standard string length (string.Length counts UTF-16 code units, not visible characters). 

A couple of ways are shown.



Source Code
using System;
using System.Globalization;
using System.Diagnostics;
                    
/// <summary>
/// Demonstrates several ways of measuring the length of a Unicode string and
/// prints how many Stopwatch ticks each approach took.
/// </summary>
public class Program
{
    /// <summary>
    /// Measures the sample string with each technique in turn, timing each one
    /// with a <see cref="Stopwatch"/> and writing the result to the console.
    /// </summary>
    public static void Main()
    {
        // string.Length counts UTF-16 code units, not user-perceived characters.
        // Example: "🎶🔥é-" has a Length of 6 although only 4 characters are visible,
        // because 🎶 and 🔥 lie above U+FFFF and are each stored as a surrogate pair
        // (two UTF-16 code units).
        // The sample below contains four such characters, so its Length is larger
        // than its number of text elements.
        string s = "𠇰😈🎶🔥é-";
        string s2 = "𠇰😈🎶🔥é-";

        // 1. Plain string.Length (UTF-16 code units).
        Stopwatch sw = Stopwatch.StartNew();
        int typicalLength = s.Length;
        sw.Stop();
        Console.WriteLine("Normal String {0} len = {1} in {2} ticks.", s, typicalLength, sw.ElapsedTicks.ToString("N0"));

        // Untimed warm-up of StringInfo so the first timed use below does not also pay
        // any one-off initialisation cost. Results vary if this is moved before or
        // after the implementations below.
        StringInfo stringInfoThrowAway = new StringInfo("Preload this function");

        // 2. StringInfo.LengthInTextElements on an instance.
        sw.Restart();
        StringInfo stringInfo = new StringInfo(s2);
        int stringInfoLength = stringInfo.LengthInTextElements;
        sw.Stop();
        // Report the string that was actually measured (s2, same content as s).
        Console.WriteLine("new StringInfo {0} len = {1} in {2} ticks.", s2, stringInfoLength, sw.ElapsedTicks.ToString("N0"));

        // 3. ParseCombiningCharacters returns the start index of every text element,
        //    so the array length is the number of text elements.
        sw.Restart();
        int[] textElemIndex = StringInfo.ParseCombiningCharacters(s);
        int textElemCount = textElemIndex.Length;
        sw.Stop();
        Console.WriteLine("ParseCombiningCharacters String {0} len = {1} in {2} ticks.", s, textElemCount, sw.ElapsedTicks.ToString("N0"));

        // 4. Walk the text elements with the enumerator and count them.
        sw.Restart();
        TextElementEnumerator charEnum = StringInfo.GetTextElementEnumerator(s);
        int enumeratedCount = 0;
        while (charEnum.MoveNext())
        {
            enumeratedCount++;
        }
        sw.Stop();
        Console.WriteLine("GetTextElementEnumerator String {0} len = {1} in {2} ticks.", s, enumeratedCount, sw.ElapsedTicks.ToString("N0"));

        // 5. ToCharArray yields one element per UTF-16 code unit. Read the length from
        //    the array itself so the reported value is really the one this technique
        //    produces (it equals string.Length).
        sw.Restart();
        char[] charsInString = s.ToCharArray();
        int charLength = charsInString.Length;
        sw.Stop();
        Console.WriteLine("ToCharArray {0} len = {1} in {2} ticks.", s, charLength, sw.ElapsedTicks.ToString("N0"));
    }
}

2 comments:

  1. char[] charsInString = s.ToCharArray();
    int charLength = textElemIndex.Length;

    Shouldn't the second line be
    int charLength = charsInString.Length;

    which results in len = 10

    ReplyDelete
    Replies
    1. You are absolutely correct, updated post. Thanks Iibin for the eyes. I'm going code blind - ὃ

      Delete