Thursday, December 12, 2024

Get the number of columns and rows for an HTML Table using Regex



Here's a quick, ad hoc way to get just the dimensions of an HTML table. In this post, I'll focus on getting the number of rows and columns for an HTML table.

You can use this algorithm in any language: use the regular expressions below to enumerate the rows, then count the cells within each row with a second regular expression.

I wanted to do this to avoid using bulky and unreliable third-party libraries. So, although it is not advised, I used regular expressions to do this. Getting the number of rows in an HTML table is simple enough, but counting columns can be tricky because of jagged tables (tables using colspan=, where a row contains fewer cells than the table has columns). The other motivation is that this topic is sparsely covered elsewhere.

Normally in C#, the way to solve this would be to load the document using the HTML Agility Pack library or the more modern AngleSharp library, but both are pretty heavy downloads and require a detailed security review. Who has time for that!

So, here's the C# regex solution for getting the size of an HTML table, as live code.



So, here's the C# regex solution for getting the size of an HTML table.

using System;
using System.Linq;
using System.Collections.Generic;
using System.Text.RegularExpressions;

/// <summary>
/// Console demo that estimates the dimensions (rows and columns) of an HTML table
/// using regular expressions only, without an HTML parsing library.
/// </summary>
/// <remarks>
/// Every table row is located with one regex and the header/data cells inside it are
/// counted with a second one. Each cell counts as exactly one column: a colspan
/// attribute is NOT expanded, so a row such as a single cell with colspan=6 is
/// recorded as 1. To soften the effect of such jagged rows, the final column count may
/// fall back from the maximum to a trimmed average (see EstimateColumnCount).
/// Both patterns stop at the first closing tag they meet, so nested tables are not
/// supported.
/// </remarks>
public class Program
{
	// Matches one table row; group 1 is the markup between the opening tr tag and the
	// first following closing tr tag. "\b" keeps "<tr" from matching longer tag names,
	// "[^>]*" keeps the opening tag from running past its own '>', and Singleline lets
	// "." cross line breaks because rows normally span several lines.
	private static readonly Regex RowRegex = new Regex(
		@"<tr\b[^>]*>(.*?)</tr>",
		RegexOptions.Singleline | RegexOptions.IgnoreCase);

	// Matches one th or td cell including its tags. The lazy "(.*?)" plus the "\1"
	// back-reference ends the match at the first closing tag of the same kind, so
	// several cells written on one line are counted individually, and Singleline lets
	// a cell's content span more than one line.
	private static readonly Regex CellRegex = new Regex(
		@"<(t[hd])\b[^>]*>(.*?)</\1>",
		RegexOptions.Singleline | RegexOptions.IgnoreCase);

	// The trimmed average is only trusted once at least this many rows remain after
	// the smallest and largest per-row counts have been removed.
	private const int MinTrimmedRowsForAverage = 5;

	/// <summary>
	/// Runs the demo against a built-in sample table and writes the per-row matches,
	/// the maximum cell count, the row count and the final column estimate to the console.
	/// </summary>
	public static void Main()
	{
		string html = @"
<table>
    <caption>Employee Information for https://metadataconsulting.blogspot.com/</caption>
    <thead>
        <tr>
            <th>Name</th>
            <th>Position</th>
            <th>Office</th>
            <th>Age</th>
            <th>Start Date</th>
            <th>Salary</th>
        </tr>
    </thead>
    <tbody>
        <tr class='buttler'>
            <td>John Doe</td>
            <td>Software Engineer</td>
            <td>New York</td>
            <td>30</td>
            <td>2015-01-15</td>
            <td>$120,000</td>
        </tr>
        <tr>
            <td>Jane Smith</td>
            <td>Project Manager</td>
            <td>London</td>
            <td>40</td>
            <td>2010-03-25</td>
            <td>$150,000</td>
        </tr>
        <tr>
            <td>Emily Johnson</td>
            <td>Designer</td>
            <td>San Francisco</td>
            <td>28</td>
            <td>2018-07-12</td>
            <td>$100,000</td>
        </tr>
    </tbody>
    <tfoot>
        <tr>
            <td colspan=6>End of Employee Information</td>
        </tr>
    </tfoot>
</table>"; 

		List<int> cellsPerRow = CountCellsPerRow(html);

		// Widest row seen; 0 when the markup contains no rows at all.
		int maxColumns = cellsPerRow.Count > 0 ? cellsPerRow.Max() : 0;

		Console.WriteLine("Max number of columns {0}", maxColumns);
		Console.WriteLine("Number of rows {0}", cellsPerRow.Count);
		Console.WriteLine("List: " + string.Join(", ", cellsPerRow));

		int finalColumns = EstimateColumnCount(cellsPerRow, maxColumns);

		Console.WriteLine("Final number of columns {0}", finalColumns);
	}

	/// <summary>
	/// Finds every table row in <paramref name="html"/>, echoes each row and each of
	/// its cells to the console, and returns the number of cells found per row.
	/// </summary>
	/// <param name="html">Markup to scan.</param>
	/// <returns>
	/// One entry per matched row, in document order; a row without any cell
	/// contributes 0.
	/// </returns>
	private static List<int> CountCellsPerRow(string html)
	{
		List<int> cellsPerRow = new List<int>();
		MatchCollection rows = RowRegex.Matches(html);

		for (int rowIndex = 0; rowIndex < rows.Count; rowIndex++)
		{
			string rowMarkup = rows[rowIndex].Groups[1].Value;
			Console.WriteLine("Match #{0}, Match Row ='{1}'", rowIndex + 1, rowMarkup);

			MatchCollection cells = CellRegex.Matches(rowMarkup);
			for (int cellIndex = 0; cellIndex < cells.Count; cellIndex++)
			{
				Console.WriteLine("\r\n\t\t\tMatch #{0}, Match Value ='{1}' ", cellIndex + 1, cells[cellIndex].Value);
			}

			cellsPerRow.Add(cells.Count);
			Console.WriteLine();
		}

		return cellsPerRow;
	}

	/// <summary>
	/// Chooses the final column count from the per-row cell counts.
	/// </summary>
	/// <param name="cellsPerRow">Cell count of every row; not modified.</param>
	/// <param name="maxColumns">Largest value in <paramref name="cellsPerRow"/>.</param>
	/// <returns>
	/// <paramref name="maxColumns"/> when there are two rows or fewer. Otherwise the
	/// smallest and the largest count are dropped and the remaining counts are averaged
	/// (rounded half away from zero); that average is returned only when at least
	/// MinTrimmedRowsForAverage counts remain and it differs from the maximum,
	/// else <paramref name="maxColumns"/> is returned.
	/// </returns>
	private static int EstimateColumnCount(List<int> cellsPerRow, int maxColumns)
	{
		if (cellsPerRow.Count <= 2)
		{
			return maxColumns;
		}

		// Work on a sorted copy so the caller's list keeps its document order.
		List<int> sorted = new List<int>(cellsPerRow);
		sorted.Sort();

		// Drop one minimum and one maximum; at least one element always remains here.
		List<int> trimmed = sorted.GetRange(1, sorted.Count - 2);

		Console.WriteLine("Number of avgListReduced {0}", trimmed.Count);
		Console.WriteLine("Updated list: " + string.Join(", ", trimmed));

		int roundedAverage = (int)Math.Round(trimmed.Average(), MidpointRounding.AwayFromZero);
		Console.WriteLine("Average number of columns " + roundedAverage);

		if (trimmed.Count >= MinTrimmedRowsForAverage && roundedAverage != maxColumns)
		{
			return roundedAverage;
		}

		return maxColumns;
	}
}

No comments:

Post a Comment