Monday, November 16, 2020

C# .NET How to remove blank lines from a string faster, dealing with null '\0' character confusion

Typically, in C# code you would use

str.Split(TrimNewLineChars, StringSplitOptions.RemoveEmptyEntries);

to remove empty or white-space-only lines, and it is effective. But a faster way is to use a StringReader and process each line.

But there are gotchas introduced when processing the null character ('\0'), which you might deduce from the above to be a white-space character. In fact, in C# '\u0000' is a null character, but it has no special meaning in C#. It is just a null character in a string. It is considered
not white-space but a control character. It is not considered a null or a string terminator either, as it is in C. To view control characters in VS Code, see my blog post, and in Notepad++ see post.

In fact, you can look at the .NET internal storage of a string using the following code; see my next post on this.

// Renders `input` as a C# source-code expression through CodeDOM and returns the generated text.
// `input` and the enclosing method are not shown in this snippet.
// NOTE(review): presumably control characters such as '\0' then appear as visible escape sequences — confirm.
using (var writer = new StringWriter())
{
    using (var provider = CodeDomProvider.CreateProvider("CSharp"))
    {
        // Wrap the value in a primitive expression and let the C# provider write it into `writer`.
        provider.GenerateCodeFromExpression(new CodePrimitiveExpression(input), writer, null);

        return writer.ToString();
    }
}

TL;DR / Lesson learned:

Counter-intuitively,
StringSplitOptions.RemoveEmptyEntries considers the null character ('\0') whitespace, not a control character.




Source Code

using System;
using System.Text; 
using System.IO;
using System.Diagnostics;
using System.Text.RegularExpressions;
					
    /// <summary>
    /// Demo program comparing several ways of removing blank lines from a string
    /// (<c>Split</c>, <c>StringReader</c>, and <c>Regex</c>), with particular attention to lines
    /// that contain only control characters such as <c>'\0'</c> or <c>'\u0080'</c>.
    /// All string extension methods return <c>null</c> when given <c>null</c>.
    /// </summary>
    public static class Program
    {
        // Both CR and LF are listed explicitly so that splitting does not depend on the
        // platform's Environment.NewLine (which is only "\n" on Unix-like systems and would
        // leave lone '\r' line breaks unsplit).
        private static readonly char[] TrimNewLineChars = { '\r', '\n' };

        // Matches a run of white-space-only lines including their trailing line breaks.
        // \s does not match '\0' or '\u0080', so lines made only of those characters are kept.
        private static readonly Regex BlankLinesRegex =
            new Regex(@"^\s+$[\r\n]*", RegexOptions.Multiline);

        /// <summary>
        /// Removes every white-space character and every null character (<c>'\0'</c>) from
        /// <paramref name="str"/>.
        /// </summary>
        /// <param name="str">The text to compact; may be <c>null</c>.</param>
        /// <returns>The compacted text, or <c>null</c> when <paramref name="str"/> is <c>null</c>.</returns>
        public static string RemoveAllWhitespace(this string str)
        {
            if (str == null)
            {
                return null;
            }

            var buffer = str.ToCharArray();
            var kept = 0;
            foreach (char ch in str)
            {
                // '\0' is not white-space according to char.IsWhiteSpace, so it is excluded explicitly.
                if (!char.IsWhiteSpace(ch) && ch != '\0')
                {
                    buffer[kept++] = ch;
                }
            }

            return new string(buffer, 0, kept);
        }

        /// <summary>
        /// Removes leading white-space and control characters (including <c>'\0'</c>) from
        /// <paramref name="str"/>.
        /// </summary>
        /// <remarks>
        /// The previous implementation stopped after copying the first visible character, so it
        /// returned at most one character instead of the trimmed remainder of the string.
        /// </remarks>
        /// <param name="str">The text to trim; may be <c>null</c>.</param>
        /// <returns>
        /// The text starting at its first character that is neither white-space nor a control
        /// character; an empty string when no such character exists; <c>null</c> when
        /// <paramref name="str"/> is <c>null</c>.
        /// </returns>
        public static string TrimStartUnicode(this string str)
        {
            if (str == null)
            {
                return null;
            }

            for (var i = 0; i < str.Length; i++)
            {
                if (IsVisible(str[i]))
                {
                    return i == 0 ? str : str.Substring(i);
                }
            }

            return string.Empty;
        }

        /// <summary>
        /// Removes empty and white-space-only lines by splitting on CR/LF. Lines that contain
        /// only control characters (for example <c>"\0"</c>) are kept, because
        /// <see cref="string.IsNullOrWhiteSpace(string)"/> does not treat them as white-space.
        /// </summary>
        /// <param name="str">The text to filter; may be <c>null</c>.</param>
        /// <returns>
        /// The remaining lines, each terminated by <see cref="Environment.NewLine"/>, or
        /// <c>null</c> when <paramref name="str"/> is <c>null</c>.
        /// </returns>
        public static string RemoveEmptyLines(this string str)
        {
            if (str == null)
            {
                return null;
            }

            var lines = str.Split(TrimNewLineChars, StringSplitOptions.RemoveEmptyEntries);

            var sb = new StringBuilder(str.Length);
            foreach (var line in lines)
            {
                if (!string.IsNullOrWhiteSpace(line))
                {
                    sb.AppendLine(line);
                }
            }

            return sb.ToString();
        }

        /// <summary>
        /// StringReader-based variant that demonstrates the issue: lines consisting only of
        /// control characters (<c>'\0'</c>, <c>'\u0080'</c>) are NOT removed. When such a line is
        /// encountered, diagnostic information about the character is written to the console.
        /// </summary>
        /// <param name="value">The text to filter; may be <c>null</c>.</param>
        /// <returns>
        /// The remaining lines, each terminated by <see cref="Environment.NewLine"/>, or
        /// <c>null</c> when <paramref name="value"/> is <c>null</c>.
        /// </returns>
        public static String RemoveAllBlankLinesIssue(this string value)
        {
            if (value == null)
            {
                return null;
            }

            StringBuilder output = new StringBuilder(value.Length);
            using (StringReader sr = new StringReader(value))
            {
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    // \0 has no special meaning in C#; it is just a null character contained in a string.
                    if (line == "\0") // the line is exactly one '\0' character
                    {
                        Console.WriteLine("is char '\\0' empty or null = " + string.IsNullOrEmpty("\0"));
                        Console.WriteLine("is char '\\0' whitespace or null = " + string.IsNullOrWhiteSpace("\0"));
                        Console.WriteLine("is char '\\0' char.IsWhiteSpace = " + char.IsWhiteSpace('\0'));
                        Console.WriteLine("is char '\\0' char.IsControl = " + char.IsControl('\0'));
                    }

                    if (line.IndexOf('\u0080') >= 0)
                    {
                        Console.WriteLine("is char '\\0080' empty or null = " + string.IsNullOrEmpty("\u0080"));
                        Console.WriteLine("is char '\\0080' whitespace or null = " + string.IsNullOrWhiteSpace("\u0080"));
                    }

                    // IsNullOrWhiteSpace already covers the empty string.
                    if (!string.IsNullOrWhiteSpace(line))
                    {
                        output.AppendLine(line);
                    }
                }
            }

            return output.ToString();
        }

        /// <summary>
        /// StringReader-based variant that removes every line containing nothing but white-space
        /// and/or control characters. Kept lines are emitted unmodified.
        /// </summary>
        /// <param name="value">The text to filter; may be <c>null</c>.</param>
        /// <returns>
        /// The remaining lines, each terminated by <see cref="Environment.NewLine"/>, or
        /// <c>null</c> when <paramref name="value"/> is <c>null</c>.
        /// </returns>
        public static String RemoveAllBlankLinesFinal(this string value)
        {
            if (value == null)
            {
                return null;
            }

            StringBuilder output = new StringBuilder(value.Length);
            using (StringReader sr = new StringReader(value))
            {
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    // Scan in place instead of allocating a trimmed copy of every line.
                    if (HasVisibleCharacter(line))
                    {
                        output.AppendLine(line);
                    }
                }
            }

            return output.ToString();
        }

        /// <summary>
        /// Regex-based variant: removes runs of white-space-only lines together with their line
        /// breaks. Line breaks after non-blank lines are left untouched.
        /// </summary>
        /// <param name="s">The text to filter; may be <c>null</c>.</param>
        /// <returns>The filtered text, or <c>null</c> when <paramref name="s"/> is <c>null</c>.</returns>
        public static String RemoveAllBlankLinesRegex(this string s)
        {
            if (s == null)
            {
                return null;
            }

            return BlankLinesRegex.Replace(s, string.Empty);
        }

        /// <summary>
        /// Runs each blank-line removal strategy once against the same sample text and prints the
        /// result together with the elapsed stopwatch ticks.
        /// </summary>
        public static void Main()
        {
            string emptytest = "Tell me and I forget.\n \n     \nTeach me and I remember.     \r\n \r\n\r\nInvolve me and I learn.  \r     \r\r\0\r\r   Pad Unicode \\u0080 next line\n\n\u0080\r\rby Benjamin Franklin.\r\n";

            Stopwatch sw = Stopwatch.StartNew();
            string output = emptytest.RemoveEmptyLines();
            sw.Stop();
            Console.WriteLine(output + " in " + sw.ElapsedTicks + " ticks");

            Console.WriteLine();
            Console.WriteLine("-------- StringReader Issue ----------------");
            sw.Restart();
            output = emptytest.RemoveAllBlankLinesIssue();
            sw.Stop();
            Console.WriteLine(output + " in " + sw.ElapsedTicks + " ticks");

            Console.WriteLine();
            Console.WriteLine("-------- StringReader Final ----------------");
            sw.Restart();
            output = emptytest.RemoveAllBlankLinesFinal();
            sw.Stop();
            Console.WriteLine(output + " in " + sw.ElapsedTicks + " ticks");

            Console.WriteLine();
            Console.WriteLine("-------- Regex ----------------");
            sw.Restart();
            output = emptytest.RemoveAllBlankLinesRegex();
            sw.Stop();
            Console.WriteLine(output + "\n in " + sw.ElapsedTicks + " ticks");
        }

        // True when the character is neither white-space nor a control character.
        private static bool IsVisible(char ch)
        {
            return !char.IsWhiteSpace(ch) && !char.IsControl(ch);
        }

        // True when the line contains at least one visible character.
        private static bool HasVisibleCharacter(string line)
        {
            foreach (char ch in line)
            {
                if (IsVisible(ch))
                {
                    return true;
                }
            }

            return false;
        }
    }

No comments:

Post a Comment