Thursday, February 11, 2021

C# .NET Get difference between two unordered not unique lists using BitArrays for super fast speed


Here's how to get the difference between two lists preserving duplicates in each list that is super fast, using BitArrays. 

It's a counter intuitive approach since it loops over the lists 3 times, but since we are just setting a bit, it's super efficient. Definitely, interesting idea to use for other situations. 

As in my previous post, i dismiss all the Stack Overflow solutions with live code, and I presented my 1st attempt at the solution. 

This post demonstrates my super fast implementation, enjoy! 

//UNORDERED, NOT UNIQUE lists, lists with duplicates 

string[] arrA = new string[] { "1", "1", "2", "3", "4", "5", "9", "7", "7", "7" };
string[] arrB = new string[] { "1", "1", "2", "2", "3", "4", "5", "5", "5", "5" };

//DESIRED RESULT

2,5,5,5,7,7,7,9 OR 2,5,5,5,9,7,7,7 OR 9,7,7,7,2,5,5,5  //UNORDERED IS SUFFICIENT

NOTE: Linq Except but it's designed as a set command, and as a consequence it removes duplicates.

IEnumerable<string> AminuBdifference = arrA.Except(arrB); //removes dups
IEnumerable<string> BminuAdifference = arrB.Except(arrA); //remove dups

IEnumerable<string> Uniondifference = arrA.Union(arrb); //ideally, but no good because dups are removed

See https://dotnetfiddle.net/oHOms5  which demonstrates Linq Except use.


Below is my super fast implementation, using a pure BitArray implementations, enjoy!

Use case: For list with less than 100 values, otherwise use
 ExceptIndistinct.

 

Source Code
using System;
using System.Collections; 
using System.Collections.Generic;
using System.Linq;
using System.Diagnostics; 

//CHALLENGE - Get difference between two unorderd, not unique lists, duplicates allowed within a list and across both lists.
//SOLUTION - https://metadataconsulting.blogspot.com/2021/02/CSharp-dotNet-Get-difference-between-two-unordered-not-unique-lists-aka-duplicates-allowed-within-a-list-and-across-both-lists.html
//OPTIMIZE - THIS IS IT using BitArray - https://metadataconsulting.blogspot.com/2021/02/CSharp-dotNet-Get-difference-between-two-unordered-not-unique-lists-lightning-fast-optimized.html
//OPTIMIZEv2 - All BitArray imp - https://metadataconsulting.blogspot.com/2021/02/CSharp-dotNet-Get-difference-between-two-unordered-not-unique-lists-using-BitArrays-for-super-fast-speed.html
//           - use ExceptIndistinct for >100 items 
public static class Program
{
	public static int loopcnt = 0;
	
	//https://stackoverflow.com/questions/2975944/except-has-similar-effect-to-distinct
	public static IEnumerable<T> ExceptFrom<T>(
		this IEnumerable<T> first,
		IEnumerable<T> second) //,
		//IEqualityComparer<TSource> comparer)
		{
			if (first == null)
				throw new ArgumentNullException();

			if (second == null)
				throw new ArgumentNullException();

			//var secondSet = second as HashSet<TSource> ?? // this trick ignore the comparer
			//				second.ToHashSet(comparer ?? EqualityComparer<TSource>.Default);

		    var secondSet = second as HashSet<T> ?? // this trick ignore the comparer
							second.ToHashSet(EqualityComparer<T>.Default);

			// Contains is O(1) for HashSet.
			return first.Where(v => !secondSet.Equals(v)); //change to equals, not Contains
		}

	//use after 100 items
    public static IEnumerable<T> ExceptIndistinct<T>(this IEnumerable<T> first, IEnumerable<T> second) 
	{
		if(first == null) { throw new ArgumentNullException("first"); }
		if(second == null) { throw new ArgumentNullException("second"); }

		var secondCounts = new Dictionary<T, int>(EqualityComparer<T>.Default);
		int count;
		int nullCount = 0;

		foreach(var item in second)
		{
			loopcnt++; 
			if(item == null) { nullCount++; }
			else if(secondCounts.TryGetValue(item, out count)) { secondCounts[item] = count + 1; }
			else { secondCounts.Add(item, 1); }
			
		}

		foreach(var item in first)
		{
			loopcnt++; 
			if(item == null)
			{
				nullCount--;
				if(nullCount < 0) { yield return item; }
			}
			else if(secondCounts.TryGetValue(item, out count))
			{
				if(count == 0)
				{
					secondCounts.Remove(item);
					yield return item;
				}
				else { secondCounts[item] = count - 1; }
			}
			else { yield return item; }
		}
	}

	public static void Main()
	{
			Stopwatch sw = new Stopwatch();
			

			//RANDOM case - typical use
			string[] arrA = new string[] { "a", "7", "1", "2", "3", "4", "5", "9", "7", "7", "1","222222222222222222","22222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222323" };
			string[] arrB = new string[] { "a", "1", "1", "2", "2", "3", "4", "5", "5", "5", "5","999999999999999999","22222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222323" };
			
			sw.Start(); 
			//WANT           2,5,5,5,9,7,7,7

			//WORST case - entirely different, but have 1 value in common to make it interesting
			//string[] arrA = new string[] { "1", "1", "2", "3", "4", "5", "6", "7", "7", "7" };//1st list
			//string[] arrB = new string[] { "8", "9", "10", "12", "1", "14", "15", "15", "11", "9" };//2nd list

			//BEST case - identical
			//string[] arrA = new string[] { "1", "1", "2", "3", "4", "5", "6", "7", "7", "7" };//1st list
			//string[] arrB = new string[] { "1", "1", "2", "3", "4", "5", "6", "7", "7", "7" };//2nd list


			sw.Stop();
			Console.WriteLine("RANDOM CASE -two unorderd, not unique lists ");
			Console.WriteLine("A  = " + string.Join(",", arrA));
			Console.WriteLine("B  = " + string.Join(",", arrB));
	    	Console.WriteLine("A count = " + arrA.Count());
			Console.WriteLine("B count = " + arrB.Count());
			
			sw.Restart();

			string[] arrBminusA = new string[arrB.Length];
			string[] arrAminuB = new string[arrA.Length];   
			BitArray bitArrAminusB = new BitArray(arrA.Length);
			BitArray bitarrBminusA = new BitArray(arrB.Length);
			
			//May seem counter intuitive to run this over 3 loop but it's faster ~ 10 ticks
			int a = 0;
            for (int b = 0; b < arrB.Length; b++)
            {
                for (a = 0; a < arrA.Length; a++)
                {
                    loopcnt++;//not need for timing analysis 

                    if (!bitArrAminusB[a] && arrA[a].Length == arrB[b].Length && arrA[a] == arrB[b])
                    {
                        bitArrAminusB.Set(a, true); //A-B, list A difference/subtract of list B  
                        bitarrBminusA.Set(b, true); //B-A, list B difference/subtract of list A  
                        break;

                    }
                }
            }

            int idxa = 0;
            for (int aa = 0; aa < arrA.Length; aa++)
            {
                if (!bitArrAminusB[aa])
                    arrAminuB[idxa++] = arrA[aa];
                
            }

            int idxb = 0;
            for (int bb = 0; bb < arrB.Length; bb++)
            {
                if (!bitarrBminusA[bb])
                    arrBminusA[idxb++] = arrB[bb];

            }
			sw.Stop();

           Console.WriteLine("(A-B) Result = " + string.Join(",", arrAminuB).Trim(','));
           //Console.WriteLine("D  = " + string.Join(",", D));
           //Console.WriteLine("E  = " + string.Join(",", E)); 

           Console.WriteLine("(B-A) Result = " + string.Join(",", arrBminusA).TrimEnd(','));

           Console.WriteLine("No. of loops = " + loopcnt);
           Console.WriteLine("No. of loops = " + arrA.Length*2 + " - Post copy using BitArray to a sized array");

		   //Un	
           Console.WriteLine("Result Unorderd Final A then B = " + (string.Join(",", arrAminuB)).Trim(',') + "," + string.Join(",", arrBminusA).Trim(',') + " in " + sw.ElapsedTicks + " ticks.");
           Console.WriteLine("Result Unorderd Final B then A = " + (string.Join(",",  arrBminusA)).Trim(',') + "," + string.Join(",",arrAminuB).Trim(',') + " in " + sw.ElapsedTicks + " ticks.");
 		   Console.WriteLine("Note: In a VS solution this is 47 ticks.");
           
           List<string> D = arrBminusA.ToList().Concat(arrAminuB.ToList()).ToList();
           D.Sort();

           Console.WriteLine("Result  Ordered Final = " + string.Join(",", D).TrimStart(','));
		   Console.WriteLine("Result  O(n^2+2n) but very fast - can you beat it?"); 
		
		   loopcnt = 0; 
		   Console.WriteLine("\n\nExceptIndistinct - Reddit Solution by UninformedPleb\n"); 
           sw.Reset(); 
		   
		   
		   List<string> listA = new List<string> { "a", "7", "1", "2", "3", "4", "5", "9", "7", "7", "1","222222222222222222","22222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222323" };
		   List<string> listB = new List<string>  { "1", "1", "2", "2", "3", "4", "5", "5", "5", "5" };
		   
		   sw.Start(); 
           List<string> listC =  listA.ExceptIndistinct(listB).ToList();
		   List<string> listD =  listB.ExceptIndistinct(listA).ToList();

     	   sw.Stop();

		   Console.WriteLine("No. of loops = " + loopcnt);
		   Console.WriteLine("Result Unorderd Final A then B = " + (string.Join(",", listC)).Trim(',') + "," + string.Join(",", listD).Trim(',') + " in " + sw.ElapsedTicks + " ticks.");
			
		   loopcnt = 0; 
		   Console.WriteLine("\n\nExceptIndistinct - Reddit Solution by UninformedPleb using arrays\n"); 
           sw.Reset(); 
		   
		   string[] arrAA = new string[] { "a", "7", "1", "2", "3", "4", "5", "9", "7", "7", "1","222222222222222222","22222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222323" };
    	   string[] arrBB = new string[] { "1", "1", "2", "2", "3", "4", "5", "5", "5", "5","999999999999999999","22222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222" };
  		   sw.Start(); 
		   
           string[] listCC =  arrAA.ExceptIndistinct(arrBB).ToArray(); 
		   string[] listDD =  arrBB.ExceptIndistinct(arrAA).ToArray(); 

     	   sw.Stop();
		
		   Console.WriteLine("No. of loops = " + loopcnt);
		   Console.WriteLine("Result Unorderd Final A then B = " + (string.Join(",", listC)).Trim(',') + "," + string.Join(",", listD).Trim(',') + " in " + sw.ElapsedTicks + " ticks.");
		
		
		
		   loopcnt = 0; 
		   Console.WriteLine("\n\nStackOverflow ExceptFrom\n"); 
           sw.Reset(); 
		 
		   string[] arrAAA = new string[] { "a", "7", "1", "2", "3", "4", "5", "9", "7", "7", "1","222222222222222222","22222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222323" };
    	   string[] arrBBB = new string[] { "1", "1", "2", "2", "3", "4", "5", "5", "5", "5","999999999999999999","22222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222" };
  		   sw.Start(); 
		   
           string[] listCCC =  arrAAA.ExceptFrom(arrBBB).ToArray(); 
		   string[] listDDD =  arrBBB.ExceptFrom(arrAAA).ToArray(); 

     	   sw.Stop();
		
		   Console.WriteLine("No. of loops = " + loopcnt);
		   Console.WriteLine("Result Unorderd Final A then B = " + (string.Join(",", listC)).Trim(',') + "," + string.Join(",", listD).Trim(',') + " in " + sw.ElapsedTicks + " ticks.");
		
		
		
			
	}
}

No comments:

Post a Comment