Pages

Saturday, September 5, 2020

C# .NET - How to get a string between two strings fast

Here's an indulgent look into getting a string between 2 strings. I could not find any satisfactory* solutions out there so I did my own. 

*Unsatisfactory because the solutions out there generally did not take care of edge cases and did not state whether they were maximal spanning (greedy, in regex terms) or minimal spanning. 

So, having a little extra time, I decided to really look at this. I learnt that some of the edge-case checks were not needed, so I could optimize this and make it fast, but still bulletproof.


Note: This solution gets the maximal span between two strings. So the 1st string is searched for from the beginning, and the 2nd string is searched for from the end.  
For a minimal span, the 2nd string would instead be searched for from a position after the 1st. 

See my post on minimal spanning 
https://metadataconsulting.blogspot.com/2020/09/C-NET-How-to-get-a-string-between-two-strings-minimal-spanning-fast.html




The code below implements a debug version of the method, which shows which rules get triggered, across an extensive set of test cases.

  
Source Code
 
using System;using System.Diagnostics;

namespace GetBetweenStrings_Blog
{
    public static class Extensions
    {
        /// <summary>
        /// Returns the text between two anchor strings using a maximal span:
        /// from just after the first occurrence of <paramref name="from"/> up to
        /// (not including) the last occurrence of <paramref name="to"/>.
        /// </summary>
        /// <param name="s">Source string to search.</param>
        /// <param name="from">Left anchor, located by searching forward from the start of <paramref name="s"/>; the result begins right after it.</param>
        /// <param name="to">Right anchor, located by searching backward from the end of <paramref name="s"/>; the result ends right before it.</param>
        /// <returns>
        /// The substring between the anchors, or <see cref="string.Empty"/> when any argument is
        /// null or empty, either anchor is not found, or the last <paramref name="to"/> does not
        /// start after the end of the first <paramref name="from"/>.
        /// </returns>
        public static string GetFirstStringBetweenStringsCleanup(this string s, string from, string to)
        {
            if (string.IsNullOrEmpty(s) || string.IsNullOrEmpty(from) || string.IsNullOrEmpty(to))
            {
                return string.Empty;
            }

            // Ordinal search: a culture-sensitive match may span a different number of
            // characters than from.Length, which would make the offset below wrong.
            int idxFrom = s.IndexOf(from, StringComparison.Ordinal);
            if (idxFrom == -1)
            {
                return string.Empty;
            }

            int idxStart = idxFrom + from.Length;

            // Cheap early-out: with at most one character left after the left anchor,
            // no non-empty "to" can start after idxStart, so skip the backward search.
            if (idxStart >= s.Length - 1)
            {
                return string.Empty;
            }

            int idxEnd = s.LastIndexOf(to, StringComparison.Ordinal);
            if (idxEnd == -1 || idxEnd <= idxStart)
            {
                return string.Empty;
            }

            return s.Substring(idxStart, idxEnd - idxStart);
        }

        /// <summary>
        /// Debug variant of <see cref="GetFirstStringBetweenStringsCleanup"/>: same maximal-span
        /// result, but it also writes the arguments and the rejecting rule (r1 or r3) to the console.
        /// </summary>
        /// <param name="s">Source string to search.</param>
        /// <param name="from">Left anchor, located by searching forward from the start of <paramref name="s"/>; the result begins right after it.</param>
        /// <param name="to">Right anchor, located by searching backward from the end of <paramref name="s"/>; the result ends right before it.</param>
        /// <returns>
        /// The substring between the anchors, or <see cref="string.Empty"/> when any argument is
        /// null or empty, either anchor is not found, or the last <paramref name="to"/> does not
        /// start after the end of the first <paramref name="from"/>.
        /// </returns>
        public static string GetFirstStringBetweenStrings(this string s, string from, string to)
        {
            Console.Write("args from {0} to {1} = ", from, to); //debug

            // Edge case: nothing to search, or nothing to search for.
            // Anchors longer than s need no separate rule; they are simply not found below.
            if (string.IsNullOrEmpty(s) || string.IsNullOrEmpty(from) || string.IsNullOrEmpty(to))
            {
                return string.Empty;
            }

            // Ordinal search: a culture-sensitive match may span a different number of
            // characters than from.Length, which would make the offset below wrong.
            int idxFrom = s.IndexOf(from, StringComparison.Ordinal);
            if (idxFrom == -1)
            {
                return string.Empty;
            }

            int idxStart = idxFrom + from.Length;

            // Rule r1: the left anchor ends at or beyond the last character, so there is
            // no room left for a right anchor that starts after idxStart.
            if (idxStart >= s.Length - 1)
            {
                Console.WriteLine("r1. idxStart={0} >= (s.Length - 1)={1}", idxStart, s.Length - 1);
                return string.Empty;
            }

            // Search the whole string backward. LastIndexOf(to, idxStart) would be wrong here:
            // it searches backward starting AT idxStart (e.g. "aaaa", from="a", to="a" yields 1, not 3).
            int idxEnd = s.LastIndexOf(to, StringComparison.Ordinal);
            if (idxEnd == -1)
            {
                return string.Empty;
            }

            // A found index can never exceed s.Length - 1, so no upper-bound rule is needed.
            // Rule r3: the right anchor does not start after the end of the left anchor.
            if (idxEnd <= idxStart)
            {
                Console.WriteLine("r3. idxEnd={0} <= idxStart={1}", idxEnd, idxStart);
                return string.Empty;
            }

            return s.Substring(idxStart, idxEnd - idxStart);
        }
    }
    public class Program
    {
        // Separator line written under each subject string.
        private const string Divider = "=================";

        /// <summary>
        /// Builds the table of (from, to) anchor pairs that is run against both
        /// "abcd" and "abcdabcd"; some rows use the subject itself as an anchor.
        /// </summary>
        /// <param name="subject">The string under test, reused as an anchor in some rows.</param>
        /// <returns>A two-column table; column 0 is <c>from</c>, column 1 is <c>to</c>.</returns>
        private static string[,] SharedPairs(string subject)
        {
            return new string[,]
            {
                { null, null },
                { "", "" },
                { subject, subject },
                { "", subject },
                { subject, "" },

                { "a", "a" },
                { "a", "b" },
                { "a", "c" },
                { "a", "d" },
                { "a", "e" },

                { "e", "e" },

                { "ab", "a" },
                { "ab", "b" },
                { "ab", "c" },
                { "ab", "d" },

                // Same four pairs again, exactly as in the original run.
                { "ab", "a" },
                { "ab", "b" },
                { "ab", "c" },
                { "ab", "d" },

                { "abc", "a" },
                { "abc", "b" },
                { "abc", "c" },
                { "abc", "d" },

                { subject, "a" },
                { subject, "b" },
                { subject, "c" },
                { subject, "d" },

                { "a", "abc" },
                { "b", "abc" },
                { "c", "abc" },
                { "d", "abc" },
            };
        }

        /// <summary>Writes the subject string followed by the divider line.</summary>
        private static void PrintHeader(string subject)
        {
            Console.WriteLine(subject);
            Console.WriteLine(Divider);
        }

        /// <summary>Runs the debug extension for one anchor pair and writes its result on its own line.</summary>
        private static void Report(string subject, string from, string to)
        {
            Console.WriteLine(subject.GetFirstStringBetweenStrings(from, to));
        }

        /// <summary>Runs <see cref="Report"/> for every row of the table, in order.</summary>
        private static void ReportAll(string subject, string[,] pairs)
        {
            int rows = pairs.GetLength(0);
            for (int row = 0; row < rows; row++)
            {
                Report(subject, pairs[row, 0], pairs[row, 1]);
            }
        }

        /// <summary>
        /// Console driver: runs the debug extension over four subject strings, then times a
        /// single call of the cleanup variant and prints the elapsed stopwatch ticks.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        public static void Main(string[] args)
        {
            string subject = "abcd";
            PrintHeader(subject);
            ReportAll(subject, SharedPairs(subject));

            Console.WriteLine();
            subject = "abcdabcd";
            PrintHeader(subject);
            ReportAll(subject, SharedPairs(subject));
            Report(subject, "abc", "d"); //pass

            Console.WriteLine();
            subject = "aaaa";
            PrintHeader(subject);
            ReportAll(subject, new string[,]
            {
                { "a", "a" },
                { "aa", "a" },
                { "aaa", "a" },
                { "aaaa", "a" },

                { "a", "a" },
                { "a", "aa" },
                { "a", "aaa" },
                { "a", "aaaa" },
            });

            Console.WriteLine();
            subject = "aaaaaaaaaaaaaaaaaaaaaaaaaaaabbbb";
            PrintHeader(subject);
            Report(subject, "bbbb", "aaaa");

            // Time one call of the non-debug variant.
            Stopwatch timer = Stopwatch.StartNew();
            string between = subject.GetFirstStringBetweenStringsCleanup("aaaaaaa", "bb");
            timer.Stop();
            Console.WriteLine(between);
            Console.WriteLine(" in " + timer.ElapsedTicks + " ticks.");
        }
    }
}

No comments:

Post a Comment