Searching the internet for a good solution turned up nothing so here's my word count program.
It properly takes apostrophes and numbers into account and works for counting words in many programming languages. It does not count operators as words, so "if (x && 2)" has a word count of 3: the && is not counted.
It properly takes apostrophes and numbers into account and works for counting words in many programming languages. It does not count operators as words, so "if (x && 2)" has a word count of 3: the && is not counted.
//Edge Cases - apostrophe in middle of a word
//FAILED = 1'1 - 2 words
//PASS - "O'Connel" - 1 word
//PASS - goodness' - 1 word //known formally as an Elision or broadly Contraction
//PASS - 'em - 1 word //known formally as an Elision or more broadly Contraction
//Edge Case - Numbers 1,000.00 -- thousands separator - 1 word
//Edge Case - Numbers 1,000.00 -- decimal - 1 word
//FAILED = 1'1 - 2 words
//PASS - "O'Connel" - 1 word
//PASS - goodness' - 1 word //known formally as an Elision or broadly Contraction
//PASS - 'em - 1 word //known formally as an Elision or more broadly Contraction
//Edge Case - Numbers 1,000.00 -- thousands separator - 1 word
//Edge Case - Numbers 1,000.00 -- decimal - 1 word
using System;
using System.Collections.Generic;

/// <summary>
/// Word counter aimed at source code and ordinary prose. Words are runs that
/// begin with a letter or digit after whitespace, punctuation or a symbol;
/// an apostrophe between two letters and a ',' or '.' between two digits do
/// not split a word.
/// </summary>
public static class Program
{
    // Reference table for operator test cases that have not been written yet.
    // It is NOT consulted by CountWordsforCode. The value is true for
    // multi-character operators and false for single-character ones.
    // The conditional operator (t ? x : y) is a special edge case and is not listed.
    // https://docs.microsoft.com/en-us/dotnet/csharp/language-reference/operators/
    private static readonly Dictionary<string, bool> CSharpOperands = new Dictionary<string, bool>()
    {
        // multi-character operators, true
        {"<=", true},   // binary op: a space, letter or digit can surround this operator
        {">>=", true},  // binary op
        {"<<=", true},  // binary op
        {"/=", true},   // binary op
        {"||", true},   // binary op
        {"|=", true},   // binary op
        {"^=", true},   // binary op
        {"?[", true},   // binary op
        {"??", true},   // binary op
        {"?.", true},   // binary op
        {">>", true},   // binary op
        {">=", true},   // binary op
        {"==", true},   // binary op
        {"<<", true},   // binary op
        {"->", true},   // binary op
        {"-=", true},   // binary op
        {"--", true},   // --x prefix or suffix, not binary -> var x = 1 -- 2; is an error
        {"+=", true},   // binary op
        {"++", true},   // ++x prefix or suffix, not binary -> var x = 1 ++ 2; is an error
        {"*=", true},   // binary op
        {"&=", true},   // binary op
        {"&&", true},   // binary op
        {"%=", true},   // binary op
        {"=>", true},   // binary op
        {"!=", true},   // binary op
        // single-character operators, false
        {"!", false},   // prefix operator
        {"~", false},   // prefix operator
        {"|", false},   // binary op only
        {".", false},   // binary op for words, not for numbers (decimal, considered 1 word: 100.00)
        {"-", false},   // prefix operator & binary op
        {"+", false},   // prefix operator & binary op
    };

    /// <summary>
    /// Tells whether <paramref name="c"/> is treated as an apostrophe: the ASCII
    /// apostrophe (U+0027) or the right single quotation mark (U+2019).
    /// See https://en.wikipedia.org/wiki/Apostrophe for other candidates.
    /// </summary>
    /// <param name="c">Character to test.</param>
    /// <returns>true for U+0027 or U+2019, otherwise false.</returns>
    public static bool isUnicodeApostrophe(this char c)
    {
        return c == '\'' || c == '\u2019';
    }

    /// <summary>
    /// Counts words, keeping a word intact across an apostrophe between two
    /// letters (O'Connel) and across a ',' or '.' between two digits
    /// (1,000.00 and 172.168.0.0 are one word each). Operators and other
    /// punctuation are never counted as words.
    /// </summary>
    /// <remarks>
    /// Known limitation: an apostrophe between two digits still splits,
    /// so 1'1 counts as two words.
    /// </remarks>
    /// <param name="s">Text to scan; null or empty yields 0.</param>
    /// <returns>The number of words found; never negative.</returns>
    public static int CountWordsforCode(this string s)
    {
        if (string.IsNullOrEmpty(s))
        {
            return 0;
        }

        // Boundary condition: the first character starts a word by itself.
        int wordCount = char.IsLetterOrDigit(s[0]) ? 1 : 0;

        for (int i = 1; i < s.Length; i++)
        {
            char previous = s[i - 1];

            // A word starts where a letter/digit follows a separator.
            bool afterSeparator = char.IsWhiteSpace(previous)
                || char.IsPunctuation(previous)
                || char.IsSymbol(previous);
            if (!afterSeparator || !char.IsLetterOrDigit(s[i]))
            {
                continue;
            }

            // The separator may be an in-word joiner (apostrophe, thousands
            // separator, decimal point); then this is the same word continuing.
            if (i >= 2 && IsInWordJoiner(s[i - 2], previous, s[i]))
            {
                continue;
            }

            wordCount++;
        }

        return wordCount;
    }

    // Decides whether `joiner`, sitting between `before` and `after`, keeps them in one word.
    private static bool IsInWordJoiner(char before, char joiner, char after)
    {
        if (joiner.isUnicodeApostrophe())
        {
            return char.IsLetter(before) && char.IsLetter(after);
        }

        if (joiner == ',' || joiner == '.')
        {
            // IsDigit (not IsNumber) so that only characters which can start
            // a word are joined; IsNumber also accepts characters such as
            // U+00BD that IsLetterOrDigit rejects, which made the count go negative.
            return char.IsDigit(before) && char.IsDigit(after);
        }

        return false;
    }

    /// <summary>
    /// Prints the word count of a set of edge-case inputs to the console.
    /// </summary>
    public static void Main()
    {
        // mainly edge cases
        string[] tokens =
        {
            "e",
            "ab",
            "abc",
            "abcdef",
            "",
            "a,b",
            "e e a e",
            "e}}}}}})*",
            "(CAN'T, DON'T)",
            "{'val1','val2'}",
            "\"what's here\"",
            "\"1'1 2 3 what's he’s isn't\"",
            "for goodness’ sake",
            "'em exuse me",
            "\"what 'dillygrout' is?\"",
            "\"word-for-word\"",
            "newToolStripMenuItem_Click(object sender, EventArgs e)",
            "(tabControl1.TabCount + 1).ToString();",
            "char.IsLetter(s[i-1])",
            "10.2c",
            "172.168.0.0",
            "1,000.00",
            "1,000.",
            "http://en.wikipedia.org/wiki/Rice's_theorem",
            "<script type=\"text/javascript\">"
        };

        foreach (var token in tokens)
        {
            Console.WriteLine(token + "\nword count = " + token.CountWordsforCode() + "\n");
        }
    }
}
No comments:
Post a Comment