
Thursday, January 7, 2021

C# .NET: How to get all emails from an HTML page with the a href inner text

The ExtractEmailswithInnerText method below extracts emails from an HTML page in the format "name <email address>" using HtmlAgilityPack.

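Here's a typical example of an email address we find on a webpage (illustrative placeholder address):

Jane Doe

which is marked up with the following HTML:

<a href="mailto:jane.doe@example.com">Jane Doe</a>

where the inner text is "Jane Doe".

Therefore the expected result is this aliased email address: Jane Doe <jane.doe@example.com>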

Source Code

using System;using System.Text; using System.Linq;
public class Program
{
	/// <summary>
	/// Extracts every anchor whose <c>href</c> is a <c>mailto:</c> link and
	/// formats it as an aliased address: <c>inner text &lt;address&gt;</c>.
	/// </summary>
	/// <param name="s">Raw HTML markup to scan. May be null or empty.</param>
	/// <returns>
	/// One line per mailto anchor, each terminated by a newline, in document order.
	/// Returns <see cref="string.Empty"/> when the input is null/empty, contains no
	/// anchors with an href, contains no mailto links, or cannot be processed.
	/// </returns>
	public static string ExtractEmailswithInnerText(string s)
	{
		const string mailto = "mailto:";

		if (string.IsNullOrEmpty(s))
		{
			return string.Empty;
		}

		// Remove ASCII and Unicode control characters before parsing.
		string h = new string(s.Where(c => !char.IsControl(c)).ToArray());

		StringBuilder sb = new StringBuilder(h.Length);
		try
		{
			HtmlAgilityPack.HtmlDocument htmldoc = new HtmlAgilityPack.HtmlDocument();
			htmldoc.LoadHtml(h);

			// SelectNodes returns null (not an empty collection) when nothing matches,
			// so the result has to be null-checked before it is enumerated.
			HtmlAgilityPack.HtmlNodeCollection anchors = htmldoc.DocumentNode.SelectNodes("//a[@href]");
			if (anchors == null)
			{
				return string.Empty;
			}

			foreach (HtmlAgilityPack.HtmlNode node in anchors)
			{
				string href = node.GetAttributeValue("href", string.Empty);

				// URI schemes are case-insensitive; compare ordinally so the
				// current culture cannot affect the match.
				if (!href.StartsWith(mailto, StringComparison.OrdinalIgnoreCase))
				{
					continue;
				}

				// Strip only the leading scheme; a "mailto:" appearing later in
				// the value is left intact.
				sb.Append(node.InnerText.Trim())
					.Append(" <")
					.Append(href.Substring(mailto.Length))
					.AppendLine(">");
			}

			return sb.ToString();
		}
		catch (Exception)
		{
			// Deliberate best-effort: markup that cannot be processed yields
			// no results instead of propagating a parser failure to the caller.
			return string.Empty;
		}
	}

	/// <summary>
	/// Demo entry point: runs the extractor over the sample pages and prints the results.
	/// </summary>
	public static void Main()
	{
		string html = "<a href=\"\">Home</a><div class=\"footerLeft\"><p>©<script type=\"text/javascript\">copyright=new Date();update=copyright.getFullYear();document.write(update);</script>2021      Mark Russinovich.  All Rights Reserved.</p>    <p>Email: <a href=\"\"></a></p></div>";
		//Best practice, we should check for this empty case, but not really common, defeats purpose
		//string html2 = "<a href=\"\">Home</a><div class=\"footerLeft\"><p>©<script type=\"text/javascript\">copyright=new Date();update=copyright.getFullYear();document.write(update);</script>2021      Mark Russinovich.  All Rights Reserved.</p>    <p>Email: <a href=\"\"></a></p></div>";
		// Page without any anchor elements.
		string html3 = "<div id=\"post\"><article class=\"article single\"><header class=\"article-header\"><h1>Contact</h1></header><div class=\"article-content\"><p>If you have any questions, comments or concerns, please email</p></div></article> </div>";

		Console.WriteLine(ExtractEmailswithInnerText(html));
		Console.WriteLine(ExtractEmailswithInnerText(html3));
	}
}

No comments:

Post a Comment