Thursday, December 5, 2024

Get the number of columns in an HTML Table using Regex



Here's a provisional, ad hoc way to quickly get the dimensions of an HTML table. In this post I focus on the harder of the two dimensions: the number of columns in an HTML table. I wanted to avoid using bulky and unreliable third-party libraries, so, although it is generally not advised, I used regular expressions to do this. Getting the number of rows in an HTML table is simple enough, but counting columns is a sparsely covered topic. 

Normally in C#, the way to solve this would be to load the document using the HTML Agility Pack library or the more modern AngleSharp library, but both are pretty heavy downloads and require a detailed security review. Who has time for that! 

So, here's the regex solution to counting HTML columns live code. 



So here's the regex solution to counting HTML columns. 




using System;
using System.Linq;
using System.Collections.Generic;
using System.Text.RegularExpressions;

/// <summary>
/// Demo program that estimates the dimensions of an HTML table using regular
/// expressions only (no HTML parser).
/// </summary>
/// <remarks>
/// This is a heuristic, not a parser. Known limitations: nested tables are not
/// handled (an inner &lt;/tr&gt; ends the outer row), a '&gt;' inside an attribute
/// value ends the opening tag early, and rowspan is ignored.
/// </remarks>
public class Program
{
	// The average is only trusted when at least this many rows remain AFTER the
	// smallest and largest rows were dropped; otherwise the maximum width is used.
	// NOTE(review): an older comment said "3 or more rows" but the code always
	// compared against 5 - kept at 5 to preserve behaviour; confirm the intent.
	private const int MinTrimmedRowsForAverage = 5;

	// Browsers clamp colspan to 1000; doing the same keeps the per-row sum from overflowing.
	private const int MaxColspan = 1000;

	// One table row. Group 1 = everything between the opening <tr ...> tag and the
	// next </tr>. "\b" keeps tags such as <track> from matching; Singleline lets
	// "." cross line breaks because rows normally span several lines.
	private static readonly Regex RowRegex = new Regex(
		@"<tr\b[^>]*>(.*?)</tr>",
		RegexOptions.Singleline | RegexOptions.IgnoreCase);

	// One header or data cell. Group 1 = attribute text of the opening tag.
	// "[^>]*" (instead of a greedy ".*") stops at the end of the opening tag, so
	// several cells written on one line are matched individually.
	private static readonly Regex CellRegex = new Regex(
		@"<t[hd]\b([^>]*)>.*?</t[hd]>",
		RegexOptions.Singleline | RegexOptions.IgnoreCase);

	// colspan attribute, quoted or unquoted. The look-behind rejects names that
	// merely end in "colspan" (for example data-colspan).
	private static readonly Regex ColspanRegex = new Regex(
		@"(?<![\w-])colspan\s*=\s*[""']?([0-9]+)",
		RegexOptions.IgnoreCase);

	/// <summary>
	/// Counts the rows and columns of a sample HTML table and prints the per-row
	/// matches, the maximum row width, the row count, the list of row widths and
	/// the final column estimate to the console.
	/// </summary>
	public static void Main()
	{
		string html = @"
<table>
    <caption>Employee Information for https://metadataconsulting.blogspot.com/</caption>
    <thead>
        <tr>
            <th>Name</th>
            <th>Position</th>
            <th>Office</th>
            <th>Age</th>
            <th>Start Date</th>
            <th>Salary</th>
        </tr>
    </thead>
    <tbody>
        <tr class='buttler'>
            <td>John Doe</td>
            <td>Software Engineer</td>
            <td>New York</td>
            <td>30</td>
            <td>2015-01-15</td>
            <td>$120,000</td>
        </tr>
        <tr>
            <td>Jane Smith</td>
            <td>Project Manager</td>
            <td>London</td>
            <td>40</td>
            <td>2010-03-25</td>
            <td>$150,000</td>
        </tr>
        <tr>
            <td>Emily Johnson</td>
            <td>Designer</td>
            <td>San Francisco</td>
            <td>28</td>
            <td>2018-07-12</td>
            <td>$100,000</td>
        </tr>
    </tbody>
    <tfoot>
        <tr>
            <td colspan=6>End of Employee Information</td>
        </tr>
    </tfoot>
</table>";

		List<int> rowWidths = new List<int>();
		int maxColumns = 0;
		int rowNumber = 0;

		foreach (Match row in RowRegex.Matches(html))
		{
			rowNumber++;
			string rowHtml = row.Groups[1].Value;
			Console.WriteLine("Match #{0}, Match Row ='{1}'", rowNumber, rowHtml);

			int cellNumber = 0;
			int rowWidth = 0;
			foreach (Match cell in CellRegex.Matches(rowHtml))
			{
				cellNumber++;
				Console.WriteLine("\r\n\t\t\tMatch #{0}, Match Value ='{1}' ", cellNumber, cell.Value);

				// A cell occupies as many columns as its colspan says (1 when absent).
				rowWidth += GetColspan(cell.Groups[1].Value);
			}

			// Rows without any cell are still recorded (width 0) so the row count stays exact.
			rowWidths.Add(rowWidth);
			maxColumns = Math.Max(maxColumns, rowWidth);
			Console.WriteLine();
		}

		Console.WriteLine("Max number of columns {0}", maxColumns);
		Console.WriteLine("Number of rows {0}", rowWidths.Count);
		Console.WriteLine("List: " + string.Join(", ", rowWidths));

		int finalColumns = maxColumns;

		if (rowWidths.Count > 2)
		{
			// Drop the narrowest and the widest row as outliers. This builds a new
			// list, so rowWidths itself is left untouched.
			List<int> trimmedWidths = rowWidths
				.OrderBy(width => width)
				.Skip(1)
				.Take(rowWidths.Count - 2)
				.ToList();

			Console.WriteLine("Number of avgListReduced {0}", trimmedWidths.Count);
			Console.WriteLine("Updated list: " + string.Join(", ", trimmedWidths));

			// trimmedWidths holds at least one element here, so Average() cannot throw.
			int roundedAverage = (int)Math.Round(trimmedWidths.Average(), MidpointRounding.AwayFromZero);
			Console.WriteLine("Average number of columns " + roundedAverage);

			if (trimmedWidths.Count >= MinTrimmedRowsForAverage)
			{
				finalColumns = roundedAverage;
			}
		}

		Console.WriteLine("Final number of columns {0}", finalColumns);
	}

	/// <summary>
	/// Reads the colspan value from the attribute text of a cell's opening tag.
	/// </summary>
	/// <param name="cellAttributes">Text between the tag name and the closing '&gt;' of the opening tag.</param>
	/// <returns>
	/// The colspan clamped to 1..<see cref="MaxColspan"/>; 1 when the attribute is
	/// missing, zero, or too large to parse.
	/// </returns>
	private static int GetColspan(string cellAttributes)
	{
		Match colspan = ColspanRegex.Match(cellAttributes);
		int span;
		if (colspan.Success && int.TryParse(colspan.Groups[1].Value, out span) && span >= 1)
		{
			return Math.Min(span, MaxColspan);
		}

		return 1;
	}
}

No comments:

Post a Comment