Tuesday, December 9, 2025

Enumerating the average size of Office and PDF documents - C++ code and tool


Here's some 'vibe coding' that produces fast C++ code to calculate the average size of Office documents (.docx, .xlsx, .pptx) and PDFs for your organization. It scans a drive letter; here's a command-line example.

c:\>calcavgsizedocs.exe c:\

This code features: 
  • Long path support (\\?\)
  • Skip reparse points / junctions to avoid infinite loops
  • Unicode-safe console output
  • Multi-threaded scanning + progress + current folder
  • Total docs scanned with thousands separators
  • Ctrl-C gracefully exits
  • VS2010-compatible, no C4996 warnings
5$ USD for a compiled pro version with help exe, email at 

Pro Version

Usage: CalcAvgSizeDocs <drive_letter>
Example: CalcAvgSizeDocs C:

Options:
  -h, --help    Show this help message
  -v, --version Show version info
  -s, --silent  Silence current progress output


This program is provided "AS-IS", without warranty of
merchantability or fitness for a particular purpose.


C++ Code basic version

// CalcAvgSizeDocs.cpp : Defines the entry point for the console application.
//

#include "stdafx.h" // required for a Visual Studio 2010 Win32 console app that uses precompiled headers; otherwise remove this line
#include <windows.h>
#include <string>
#include <iostream>
#include <vector>
#include <io.h>
#include <fcntl.h>

// ------------------------------------------------------------
// Global Data Structures
// ------------------------------------------------------------

struct Stats
{
    volatile unsigned __int64 count;
    volatile unsigned __int64 totalBytes;

    Stats() : count(0), totalBytes(0) {}
};

// One accumulator per tracked document type (.docx, .xlsx, .pptx, .pdf);
// filled in by ProcessFile via Accumulate.
Stats docxStats, xlsxStats, pptxStats, pdfStats;

// Work queue: directories waiting to be scanned. Workers pop from the back
// and push the subdirectories they discover; every access is made while
// holding queueLock.
std::vector<std::wstring> dirQueue;
CRITICAL_SECTION queueLock;

// Progress & control
// totalDirsQueued    - incremented each time a directory is pushed onto dirQueue
// totalDirsProcessed - incremented each time a worker pops a directory
// scanningFinished   - set by wmain after the workers return, and by the
//                      console control handler; ends the progress loop
// cancelRequested    - set by the console control handler; polled by the
//                      worker and progress loops
volatile LONG totalDirsQueued = 0;
volatile LONG totalDirsProcessed = 0;
volatile bool scanningFinished = false;
volatile bool cancelRequested = false;

// Current folder display: the directory most recently popped by any worker.
// Written by the workers and read by the progress thread, both while
// holding currentFolderLock.
std::wstring currentFolder;
CRITICAL_SECTION currentFolderLock;

// Total documents scanned: incremented once per file counted by Accumulate,
// across all four document types.
volatile unsigned __int64 totalDocsScanned = 0;

// ------------------------------------------------------------
// Utility
// ------------------------------------------------------------

// Case-insensitive suffix test: returns true when `s` ends with `ext`.
// A suffix longer than the string never matches.
bool EndsWith(const std::wstring& s, const std::wstring& ext)
{
    const size_t textLen = s.length();
    const size_t suffixLen = ext.length();

    if (suffixLen > textLen)
        return false;

    // Compare only the tail of `s` that has the same length as `ext`.
    const wchar_t* tail = s.c_str() + (textLen - suffixLen);
    return _wcsicmp(tail, ext.c_str()) == 0;
}

// Adds one file of `size` bytes to `stat` and bumps the global document
// counter. Called concurrently from the worker threads, so every update is
// an interlocked 64-bit operation.
void Accumulate(Stats &stat, unsigned __int64 size)
{
    // stat.count is 64 bits wide; a 32-bit InterlockedIncrement on its
    // address would only touch the low DWORD and lose the carry.
    InterlockedIncrement64((volatile LONGLONG*)&stat.count);
    InterlockedExchangeAdd64((volatile LONGLONG*)&stat.totalBytes, (LONGLONG)size);
    InterlockedIncrement64((volatile LONGLONG*)&totalDocsScanned);
}

void ProcessFile(const std::wstring& path)
{
    WIN32_FILE_ATTRIBUTE_DATA fad;
    if (!GetFileAttributesExW(path.c_str(), GetFileExInfoStandard, &fad))
        return;

    unsigned __int64 size =
        (((unsigned __int64)fad.nFileSizeHigh) << 32) | fad.nFileSizeLow;

    if (EndsWith(path, L".docx"))
        Accumulate(docxStats, size);
    else if (EndsWith(path, L".xlsx"))
        Accumulate(xlsxStats, size);
    else if (EndsWith(path, L".pptx"))
        Accumulate(pptxStats, size);
    else if (EndsWith(path, L".pdf"))
        Accumulate(pdfStats, size);
}

// Renders a millisecond count as HH:MM:SS.mmm. Hours are not wrapped, so
// durations of a day or more simply show a larger hour value.
std::wstring FormatDuration(DWORD ms)
{
    const DWORD millis       = ms % 1000;
    const DWORD totalSeconds = ms / 1000;
    const DWORD totalMinutes = totalSeconds / 60;
    const DWORD hours        = totalMinutes / 60;

    wchar_t text[64];
    swprintf(text, 64, L"%02u:%02u:%02u.%03u",
             hours, totalMinutes % 60, totalSeconds % 60, millis);

    return std::wstring(text);
}

// Returns `value` in decimal with a comma between every group of three
// digits, e.g. 1234567 -> "1,234,567".
std::wstring FormatWithThousandsSeparator(unsigned __int64 value)
{
    wchar_t digits[64];
    _ui64tow_s(value, digits, 64, 10);

    std::wstring text(digits);

    // Walk from the right in steps of three; stop before index 0 so no
    // comma is ever placed in front of the first digit.
    for (int pos = (int)text.length() - 3; pos > 0; pos -= 3)
        text.insert(pos, L",");

    return text;
}

// ------------------------------------------------------------
// CTRL-C Handler
// ------------------------------------------------------------

// Console control handler. For Ctrl-C, Ctrl-Break and console-close events
// it raises the cancel and finished flags, prints a notice and returns TRUE;
// every other event type returns FALSE.
BOOL WINAPI CtrlHandler(DWORD ctrlType)
{
    switch (ctrlType)
    {
    case CTRL_C_EVENT:
    case CTRL_BREAK_EVENT:
    case CTRL_CLOSE_EVENT:
        cancelRequested = true;
        scanningFinished = true;
        std::wcout << L"\n\nCTRL-C detected... stopping scan...\n";
        return TRUE;

    default:
        return FALSE;
    }
}

// ------------------------------------------------------------
// Worker Thread
// ------------------------------------------------------------

DWORD WINAPI WorkerProc(LPVOID)
{
    for (;;)
    {
        if (cancelRequested)
            break;

        std::wstring folder;

        EnterCriticalSection(&queueLock);
        if (!dirQueue.empty())
        {
            folder = dirQueue.back();
            dirQueue.pop_back();
        }
        else
        {
            LeaveCriticalSection(&queueLock);
            break;
        }
        LeaveCriticalSection(&queueLock);

        InterlockedIncrement(&totalDirsProcessed);

        EnterCriticalSection(&currentFolderLock);
        currentFolder = folder;
        LeaveCriticalSection(&currentFolderLock);

        if (cancelRequested)
            break;

        // Prepend \\?\ for long path support
        std::wstring longPath = L"\\\\?\\" + folder;
        std::wstring searchPath = longPath + L"\\*";

        WIN32_FIND_DATAW fd;
        HANDLE hFind = FindFirstFileW(searchPath.c_str(), &fd);
        if (hFind == INVALID_HANDLE_VALUE)
            continue;

        do
        {
            if (cancelRequested)
                break;

            const wchar_t* name = fd.cFileName;
            if (wcscmp(name, L".") == 0 || wcscmp(name, L"..") == 0)
                continue;

            std::wstring fullPath = folder + L"\\" + name;

            // Skip junctions / reparse points to avoid loops
            if ((fd.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) &&
                !(fd.dwFileAttributes & FILE_ATTRIBUTE_REPARSE_POINT))
            {
                EnterCriticalSection(&queueLock);
                dirQueue.push_back(fullPath);
                LeaveCriticalSection(&queueLock);
                InterlockedIncrement(&totalDirsQueued);
            }
            else if (!(fd.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY))
            {
                ProcessFile(fullPath);
            }

        } while (FindNextFileW(hFind, &fd));

        FindClose(hFind);
    }

    return 0;
}

// ------------------------------------------------------------
// Progress Thread
// ------------------------------------------------------------

// Progress thread entry point. Every 500 ms it rewrites the current console
// line with processed/queued directory counts, the integer percentage and
// the folder currently being scanned. It returns once scanning has finished
// or a cancel was requested.
DWORD WINAPI ProgressProc(LPVOID)
{
    for (;;)
    {
        if (scanningFinished || cancelRequested)
            break;

        const LONG done = totalDirsProcessed;
        const LONG total = totalDirsQueued;

        // Guard against dividing by zero before anything has been queued.
        double pct = 0.0;
        if (total > 0)
            pct = done * 100.0 / total;

        // currentFolder is written by the workers, so print it under its lock.
        EnterCriticalSection(&currentFolderLock);
        std::wcout << L"\rProcessed: " << done
                   << L"/" << total
                   << L" (" << (int)pct << L"%) "
                   << L"Current: " << currentFolder
                   << std::flush;
        LeaveCriticalSection(&currentFolderLock);

        Sleep(500);
    }
    return 0;
}

// ------------------------------------------------------------
// Print Stats
// ------------------------------------------------------------

// Prints one result line for a document type: the average file size in KB
// and the file count, or "No files found" when nothing was counted.
void PrintStats(const wchar_t* label, const Stats& s)
{
    if (s.count != 0)
    {
        const double averageBytes = (double)s.totalBytes / (double)s.count;
        const double averageKb = averageBytes / 1024.0;

        std::wcout << label << L": " << averageKb
                   << L" KB average (" << s.count << L" files)\n";
    }
    else
    {
        std::wcout << label << L": No files found\n";
    }
}

// ------------------------------------------------------------
// MAIN
// ------------------------------------------------------------

int wmain(int argc, wchar_t* argv[])
{
    // Enable UTF-8 output
    SetConsoleOutputCP(CP_UTF8);
    _setmode(_fileno(stdout), _O_U8TEXT);

    SetConsoleCtrlHandler(CtrlHandler, TRUE);

    DWORD startTime = GetTickCount();

    // Determine drive
    std::wstring root;
    if (argc >= 2)
    {
        std::wstring arg = argv[1];

        if (arg.length() == 1)
            arg += L":\\";
        else if (arg.length() == 2 && arg[1] == L':')
            arg += L"\\";
        else if (arg[arg.length() - 1] != L'\\')
            arg += L"\\";

        root = arg;
    }
    else
    {
        root = L"C:\\";
    }

    std::wcout << L"Scanning drive: " << root << L"\n";

    InitializeCriticalSection(&queueLock);
    InitializeCriticalSection(&currentFolderLock);

    dirQueue.push_back(root);
    InterlockedIncrement(&totalDirsQueued);

    HANDLE hProgress = CreateThread(NULL, 0, ProgressProc, NULL, 0, NULL);

    SYSTEM_INFO si;
    GetSystemInfo(&si);
    int numThreads = si.dwNumberOfProcessors;

    std::vector<HANDLE> threads;
    for (int i = 0; i < numThreads; ++i)
    {
        HANDLE h = CreateThread(NULL, 0, WorkerProc, NULL, 0, NULL);
        threads.push_back(h);
    }

    WaitForMultipleObjects((DWORD)threads.size(), &threads[0], TRUE, INFINITE);
    scanningFinished = true;
    WaitForSingleObject(hProgress, INFINITE);

    DeleteCriticalSection(&queueLock);
    DeleteCriticalSection(&currentFolderLock);

    DWORD endTime = GetTickCount();
    DWORD elapsed = endTime - startTime;

    std::wcout << L"\n\nResults (partial if cancelled):\n";
    PrintStats(L"DOCX", docxStats);
    PrintStats(L"XLSX", xlsxStats);
    PrintStats(L"PPTX", pptxStats);
    PrintStats(L"PDF ", pdfStats);

    std::wcout << L"\nTotal documents scanned: "
               << FormatWithThousandsSeparator(totalDocsScanned)
               << L"\n";

    std::wcout << L"\nTotal execution time: "
               << FormatDuration(elapsed) << L"\n";

    if (cancelRequested)
        std::wcout << L"\nScan cancelled by user.\n";

    return 0;
}

D:\ Drive Results

C:\Users\Markus\Documents\Visual Studio 2010\Projects\CalcAvgSizeDocs\Release>calcavgsizedocs -s d
Scanning drive: d:\

Drive size: 1.09 TB

.DOCX: 125.60 KB average (230 files)
.XLSX: 321.00 KB average (47 files)
.PPTX: 11.63 MB average (11 files)
.PDF : 2.82 MB average (1625 files)

Total documents scanned: 1,913

Total execution time: 00:01:14.537

C:\ Drive Results (pro code output)

C:\Users\Markus\Documents\Visual Studio 2010\Projects\CalcAvgSizeDocs\Release>calcavgsizedocs -s c
Scanning drive: c:\

Drive size: 931.41 GB

Total Folders Processed: 221515


Complete Results:

.DOCX: 364.55 KB average (1782 files)
.XLSX: 109.95 KB average (565 files)
.PPTX: 2.05 MB average (205 files)
.PDF : 1.54 MB average (8179 files)

Total documents scanned: 10,731

Total execution time: 01:00:03.951


Vibe coded new solution 100x faster
C:\Users\Markus\Documents\Visual Studio 2010\Projects\avgsizedocs\Debug>avgsizedocs d:\
Scanning: d:\

Folders: 70013 | Files: 606999
----------------------------------------
Total Folders: 70,751
Total Files  : 607,842
Elapsed Time : 0h 1m 40s
----------------------------------------

.DOCX: Count=137, Total=47.00 MB, Avg=351.33 KB
.XLSX: Count=92, Total=53.95 MB, Avg=600.50 KB
.PPTX: Count=24, Total=23.31 MB, Avg=994.40 KB
.PDF : Count=1476, Total=2.85 GB, Avg=1.97 MB




Please post your results in comments.

No comments:

Post a Comment