str.Split(TrimNewLineChars, StringSplitOptions.RemoveEmptyEntries);
to remove empty and white-space-only lines, and it is effective. However, a faster way is to use a StringReader and process the text line by line.
But there are gotchas when processing the null character ('\0'), which you might assume from the above to be a white-space character. In fact, in C# '\u0000' is the null character, but it has no special meaning: it is just another character in a string. It is considered a control character, not white space. Nor is it treated as null or as a string terminator, as it is in C. To view control characters in VS Code, see my blog post; for Notepad++, see this post.
In fact, you can look at the .NET internal representation of a string using the following snippet (see my next post on this):
using (var writer = new StringWriter()) { using (var provider = CodeDomProvider.CreateProvider("CSharp")) { provider.GenerateCodeFromExpression(new CodePrimitiveExpression(input), writer, null); return writer.ToString(); } }
TL;DR / Lesson Learned:
Counter-intuitively StringSplitOptions.RemoveEmptyEntries considers null character ('\0') whitespace, not a control character.
Counter-intuitively StringSplitOptions.RemoveEmptyEntries considers null character ('\0') whitespace, not a control character.
Source Code
using System;
using System.Text;
using System.IO;
using System.Diagnostics;
using System.Text.RegularExpressions;

/// <summary>
/// Demonstrates several ways of removing blank lines from a string and how each
/// one treats the null character ('\0') and other control characters.
/// </summary>
public static class Program
{
    // Split on both line-break characters explicitly. Environment.NewLine is only
    // "\n" on Unix, which would leave lone '\r' line breaks unsplit there.
    private static readonly char[] TrimNewLineChars = { '\r', '\n' };

    // Pattern used by RemoveAllBlankLinesRegex; built once instead of on every call.
    private static readonly Regex BlankLinesPattern =
        new Regex(@"^\s+$[\r\n]*", RegexOptions.Multiline);

    /// <summary>
    /// Removes every white-space character and every null character ('\0') from the string.
    /// </summary>
    /// <param name="str">The text to filter; may be null.</param>
    /// <returns>The filtered text, or null when <paramref name="str"/> is null.</returns>
    public static string RemoveAllWhitespace(this string str)
    {
        if (str == null)
        {
            return null;
        }

        var src = str.ToCharArray();
        var dstIdx = 0;
        foreach (char ch in src)
        {
            // '\0' is a control character, not white space, so it needs its own test.
            if (!char.IsWhiteSpace(ch) && ch != '\0')
            {
                // Compact in place: dstIdx never overtakes the read position.
                src[dstIdx++] = ch;
            }
        }

        return new string(src, 0, dstIdx);
    }

    /// <summary>
    /// Removes the leading run of white-space and control characters (including '\0').
    /// </summary>
    /// <param name="str">The text to trim; may be null.</param>
    /// <returns>
    /// Everything from the first character that is neither white space nor a control
    /// character to the end of the string; an empty string when no such character
    /// exists; null when <paramref name="str"/> is null.
    /// </returns>
    public static string TrimStartUnicode(this string str)
    {
        if (str == null)
        {
            return null;
        }

        var start = 0;
        while (start < str.Length && IsWhiteSpaceOrControl(str[start]))
        {
            start++;
        }

        return str.Substring(start);
    }

    /// <summary>
    /// Removes empty and white-space-only lines by splitting on '\r' and '\n'.
    /// </summary>
    /// <param name="str">The text to process; may be null.</param>
    /// <returns>
    /// The remaining lines, each terminated with <see cref="Environment.NewLine"/>,
    /// or null when <paramref name="str"/> is null.
    /// </returns>
    public static string RemoveEmptyLines(this string str)
    {
        if (str == null)
        {
            return null;
        }

        var lines = str.Split(TrimNewLineChars, StringSplitOptions.RemoveEmptyEntries);
        var sb = new StringBuilder(str.Length);
        foreach (var line in lines)
        {
            if (!string.IsNullOrWhiteSpace(line))
            {
                sb.AppendLine(line);
            }
        }

        return sb.ToString();
    }

    /// <summary>
    /// StringReader-based variant that exhibits the issue: a line holding only '\0'
    /// is not white space, so it is kept. Writes diagnostics to the console when it
    /// meets a "\0" line or a line containing '\u0080'.
    /// </summary>
    /// <param name="value">The text to process; may be null.</param>
    /// <returns>
    /// The lines that are not empty or white-space-only, each terminated with
    /// <see cref="Environment.NewLine"/>, or null when <paramref name="value"/> is null.
    /// </returns>
    public static String RemoveAllBlankLinesIssue(this string value)
    {
        if (value == null)
        {
            return null;
        }

        var output = new StringBuilder(value.Length);
        using (var sr = new StringReader(value))
        {
            string line;
            while ((line = sr.ReadLine()) != null)
            {
                // '\0' has no special meaning in C#; it is just a character in the string.
                if (line == "\0")
                {
                    Console.WriteLine("is char '\\0' empty or null = " + string.IsNullOrEmpty("\0"));
                    Console.WriteLine("is char '\\0' whitespace or null = " + string.IsNullOrWhiteSpace("\0"));
                    Console.WriteLine("is char '\\0' char.IsWhiteSpace = " + char.IsWhiteSpace('\0'));
                    Console.WriteLine("is char '\\0' char.IsControl = " + char.IsControl('\0'));
                }

                if (line.IndexOf('\u0080') >= 0)
                {
                    Console.WriteLine("is char '\\0080' empty or null = " + string.IsNullOrEmpty("\u0080"));
                    Console.WriteLine("is char '\\0080' whitespace or null = " + string.IsNullOrWhiteSpace("\u0080"));
                }

                // IsNullOrWhiteSpace already covers the empty string.
                if (!string.IsNullOrWhiteSpace(line))
                {
                    output.AppendLine(line);
                }
            }
        }

        return output.ToString();
    }

    /// <summary>
    /// StringReader-based variant that also drops lines made up solely of white-space
    /// and/or control characters (for example a lone '\0' or '\u0080').
    /// </summary>
    /// <param name="value">The text to process; may be null.</param>
    /// <returns>
    /// The kept lines, unmodified and each terminated with
    /// <see cref="Environment.NewLine"/>, or null when <paramref name="value"/> is null.
    /// </returns>
    public static String RemoveAllBlankLinesFinal(this string value)
    {
        if (value == null)
        {
            return null;
        }

        var output = new StringBuilder(value.Length);
        using (var sr = new StringReader(value))
        {
            string line;
            while ((line = sr.ReadLine()) != null)
            {
                if (!IsBlank(line))
                {
                    output.AppendLine(line);
                }
            }
        }

        return output.ToString();
    }

    /// <summary>
    /// Regex-based variant: deletes every match of <c>^\s+$[\r\n]*</c> (multiline mode).
    /// </summary>
    /// <param name="s">The text to process.</param>
    /// <returns>The text with all matches removed.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
    public static String RemoveAllBlankLinesRegex(this string s)
    {
        return BlankLinesPattern.Replace(s, string.Empty);
    }

    /// <summary>
    /// Runs every blank-line removal variant over the same sample text and prints
    /// each result together with the elapsed stopwatch ticks.
    /// </summary>
    public static void Main()
    {
        string emptytest =
            "Tell me and I forget.\n \n \nTeach me and I remember. \r\n \r\n\r\nInvolve me and I learn. "
            + "\r \r\r\0\r\r Pad Unicode \\u0080 next line\n\n\u0080\r\rby Benjamin Franklin.\r\n";

        Report(null, emptytest, RemoveEmptyLines, " in ");
        Report("-------- StringReader Issue ----------------", emptytest, RemoveAllBlankLinesIssue, " in ");
        Report("-------- StringReader Final ----------------", emptytest, RemoveAllBlankLinesFinal, " in ");
        Report("-------- Regex ----------------", emptytest, RemoveAllBlankLinesRegex, "\n in ");
    }

    // Times one variant and prints its output followed by the tick count.
    // A non-null heading is preceded by a blank separator line.
    private static void Report(string heading, string input, Func<string, string> removeBlankLines, string timingPrefix)
    {
        if (heading != null)
        {
            Console.WriteLine();
            Console.WriteLine(heading);
        }

        var sw = Stopwatch.StartNew();
        string output = removeBlankLines(input);
        sw.Stop();

        Console.WriteLine(output + timingPrefix + sw.ElapsedTicks + " ticks");
    }

    // True when the character is white space or a control character such as '\0'.
    private static bool IsWhiteSpaceOrControl(char ch)
    {
        return char.IsWhiteSpace(ch) || char.IsControl(ch);
    }

    // True when the line contains nothing but white-space/control characters (or is empty).
    private static bool IsBlank(string line)
    {
        foreach (char ch in line)
        {
            if (!IsWhiteSpaceOrControl(ch))
            {
                return false;
            }
        }

        return true;
    }
}
No comments:
Post a Comment