2024-10-30 11:59:30 -04:00
commit 17031d8be8
8 changed files with 342 additions and 0 deletions

.gitignore vendored Normal file

@@ -0,0 +1,3 @@
/.idea/
/node_modules/
/dist/

README.md Normal file

@@ -0,0 +1,136 @@
# code-tokenizer-md
Process git repository files into markdown with token counting and sensitive data redaction.
## Overview
`code-tokenizer-md` is a Node.js tool that processes git repository files, cleans code, redacts sensitive information, and generates markdown documentation with token counts.
```mermaid
graph TD
    Start[Start] -->|Read| Git[Git Files]
    Git -->|Clean| TC[TokenCleaner]
    TC -->|Redact| Clean[Clean Code]
    Clean -->|Generate| MD[Markdown]
    MD -->|Count| Results[Token Counts]

    style Start fill:#000000,stroke:#FFFFFF,stroke-width:4px,color:#ffffff
    style Git fill:#222222,stroke:#FFFFFF,stroke-width:2px,color:#ffffff
    style TC fill:#333333,stroke:#FFFFFF,stroke-width:2px,color:#ffffff
    style Clean fill:#444444,stroke:#FFFFFF,stroke-width:2px,color:#ffffff
    style MD fill:#555555,stroke:#FFFFFF,stroke-width:2px,color:#ffffff
    style Results fill:#666666,stroke:#FFFFFF,stroke-width:2px,color:#ffffff
```
## Features
### Data Processing
- Reads files from git repository
- Removes comments and unnecessary whitespace
- Redacts sensitive information (API keys, tokens, etc.)
- Counts tokens using `llama3-tokenizer-js` (see the example below)
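For example, the exported `TokenCleaner` can be driven directly to see both the cleaning and redaction passes (a minimal sketch; the `sk-12345` value is a made-up placeholder):
```javascript
import llama3Tokenizer from 'llama3-tokenizer-js';
import { TokenCleaner } from 'code-tokenizer-md';

// Clean and redact a snippet, then count the tokens that remain.
const cleaner = new TokenCleaner();
const cleaned = cleaner.cleanAndRedact('const apiKey = "sk-12345"; // secret');

console.log(cleaned); // const apiKey = "[REDACTED]";
console.log(llama3Tokenizer.encode(cleaned).length); // token count of the cleaned snippet
```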
### Analysis Types
- Token counting per file
- Total token usage
- File content analysis
- Sensitive data detection
### Data Presentation
- Markdown formatted output
- Code block formatting
- Token count summaries
- File organization hierarchy
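For instance, running the tool against this repository would produce output shaped roughly like this, with each file under its own heading and its cleaned contents in a `~~~` fenced block (illustrative):
```markdown
# Project Files

## src/index.js
~~~
export { TokenCleaner } from './TokenCleaner.js';
export { MarkdownGenerator } from './MarkdownGenerator.js';
~~~
```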
## Requirements
- Node.js (>=14.0.0)
- A git repository (files are discovered via `git ls-files`)
- npm or npx
## Installation
```shell
npm install -g code-tokenizer-md
```
## Usage
### Quick Start
```shell
npx code-tokenizer-md
```
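Run the command from the root of a git repository; by default the generated markdown is written to `./prompt.md`.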
### Programmatic Usage
```javascript
import { MarkdownGenerator } from 'code-tokenizer-md';

const generator = new MarkdownGenerator({
  dir: './project',
  outputFilePath: './output.md'
});

const result = await generator.createMarkdownDocument();
```
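`createMarkdownDocument()` resolves to a result object rather than throwing: `{ success: true, tokenCount }` on success and `{ success: false, error }` on failure, so callers can branch on it:
```javascript
if (result.success) {
  console.log(`Markdown written with ${result.tokenCount} tokens`);
} else {
  console.error('Generation failed:', result.error);
}
```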
## Project Structure
```
src/
├── index.js             # Main exports
├── TokenCleaner.js      # Code cleaning and redaction
├── MarkdownGenerator.js # Markdown generation logic
└── cli.js               # CLI implementation
```
## Dependencies
```json
{
  "dependencies": {
    "llama3-tokenizer-js": "^1.0.0"
  },
  "peerDependencies": {
    "node": ">=14.0.0"
  }
}
```
## Extending
### Adding Custom Patterns
```javascript
const generator = new MarkdownGenerator({
  customPatterns: [
    { regex: /TODO:/g, replacement: '' }
  ],
  customSecretPatterns: [
    { regex: /mySecret/g, replacement: '[REDACTED]' }
  ]
});
```
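Custom patterns are appended to the built-in lists: `customPatterns` entries run during the cleaning pass, and `customSecretPatterns` entries run during the redaction pass that follows.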
## Contributing
1. Fork the repository
2. Create a feature branch
3. Commit your changes
4. Push to the branch
5. Open a Pull Request
### Contribution Guidelines
- Follow Node.js best practices
- Include appropriate error handling
- Add documentation for new features
- Include tests for new functionality (the project does not yet have a test suite)
- Update the README for significant changes
## License
MIT © 2024 Geoff Seemueller
## Note
This tool requires a git repository to function properly.

package.json Normal file

@@ -0,0 +1,15 @@
{
  "name": "code-tokenizer-md",
  "version": "1.0.0",
  "type": "module",
  "main": "src/index.js",
  "bin": {
    "code-tokenizer-md": "./src/cli.js"
  },
  "dependencies": {
    "llama3-tokenizer-js": "^1.0.0"
  },
  "peerDependencies": {
    "node": ">=14.0.0"
  }
}

pnpm-lock.yaml generated Normal file

@@ -0,0 +1,32 @@
lockfileVersion: '6.0'

settings:
  autoInstallPeers: true
  excludeLinksFromLockfile: false

dependencies:
  llama3-tokenizer-js:
    specifier: ^1.0.0
    version: 1.2.0
  node:
    specifier: '>=14.0.0'
    version: 22.11.0

packages:

  /llama3-tokenizer-js@1.2.0:
    resolution: {integrity: sha512-oMgIgK958UlvoEm3Lz/gAj3QAKpnAMb6YqlY0aTYraSK/c+V3TF3P7IWFQJe4yjM60+2/KoK+EWziec6WQ57/g==}
    dev: false

  /node-bin-setup@1.1.3:
    resolution: {integrity: sha512-opgw9iSCAzT2+6wJOETCpeRYAQxSopqQ2z+N6BXwIMsQQ7Zj5M8MaafQY8JMlolRR6R1UXg2WmhKp0p9lSOivg==}
    dev: false

  /node@22.11.0:
    resolution: {integrity: sha512-RIAOdr40k1sq/DYF5u3XmhQHG+FZViuxObe2w1xPmOjEi4AiFgv/XRHW60YydS85X9gc8/jaI9RH4E9nvrV+gQ==}
    engines: {npm: '>=5.0.0'}
    hasBin: true
    requiresBuild: true
    dependencies:
      node-bin-setup: 1.1.3
    dev: false

src/MarkdownGenerator.js Normal file

@@ -0,0 +1,98 @@
// src/MarkdownGenerator.js
import path from 'path';
import { execSync } from 'child_process';
import fs from 'fs/promises';
import llama3Tokenizer from 'llama3-tokenizer-js';
import { TokenCleaner } from './TokenCleaner.js';

export class MarkdownGenerator {
  constructor(options = {}) {
    this.dir = options.dir || '.';
    this.outputFilePath = options.outputFilePath || './prompt.md';
    this.fileTypeExclusions = new Set(
      options.fileTypeExclusions || ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp', '.tiff', '.lockb', '.yaml', '.ico', '.ttf', '.css'],
    );
    this.fileExclusions = options.fileExclusions || ['prompt.js', '.gitignore', '.env', '.dev.vars'];
    this.tokenCleaner = new TokenCleaner(options.customPatterns, options.customSecretPatterns);
    this.verbose = options.verbose ?? true;
  }

  // Returns git-tracked files, minus excluded extensions and paths.
  async getTrackedFiles() {
    try {
      const output = this.execCommand('git ls-files');
      const trackedFiles = output.split('\n').filter((file) => file.length > 0);
      if (this.verbose) console.log(`Total tracked files: ${trackedFiles.length}`);
      return trackedFiles.filter((file) => {
        const fileExt = path.extname(file).toLowerCase();
        const isExcluded = this.fileExclusions.some((pattern) => this.isFileExcluded(file, pattern));
        return !this.fileTypeExclusions.has(fileExt) && !isExcluded;
      });
    } catch (error) {
      if (this.verbose) console.error('Error fetching tracked files:', error);
      return [];
    }
  }

  // Supports exact paths plus simple `dir/*` and `dir/*.ext` exclusion patterns.
  isFileExcluded(filePath, pattern) {
    if (pattern.endsWith('/*')) {
      const directory = pattern.slice(0, -2);
      return filePath.startsWith(directory);
    }
    if (pattern.includes('/*')) {
      const [directory, ext] = pattern.split('/*');
      return filePath.startsWith(directory) && filePath.endsWith(ext);
    }
    return filePath === pattern;
  }

  // Reads a file, then cleans and redacts it; returns '' on read errors.
  async readFileContent(filePath) {
    try {
      const content = await fs.readFile(filePath, 'utf-8');
      const cleanedAndRedactedContent = this.tokenCleaner.cleanAndRedact(content);
      if (this.verbose) {
        const tokenCount = llama3Tokenizer.encode(cleanedAndRedactedContent).length;
        console.log(`${filePath}: Tokens[${tokenCount}]`);
      }
      return cleanedAndRedactedContent;
    } catch (error) {
      if (this.verbose) console.error(`Error reading file ${filePath}:`, error);
      return '';
    }
  }

  async generateMarkdown() {
    const trackedFiles = await this.getTrackedFiles();
    if (this.verbose) console.log(`Generating markdown for ${trackedFiles.length} files`);
    let markdownContent = '# Project Files\n\n';
    for (const file of trackedFiles) {
      const content = await this.readFileContent(path.join(this.dir, file));
      // Use ~~~ fences so backtick fences inside file contents don't break the markdown.
      markdownContent += `## ${file}\n~~~\n${content.trim()}\n~~~\n`;
    }
    return markdownContent;
  }

  async createMarkdownDocument() {
    try {
      const markdownContent = await this.generateMarkdown();
      await fs.writeFile(this.outputFilePath, markdownContent);
      // Tokenize once and reuse the count for both logging and the result.
      const totalTokens = llama3Tokenizer.encode(markdownContent).length;
      if (this.verbose) {
        console.log(`Markdown document created at ${this.outputFilePath}`);
        console.log({ total_tokens: totalTokens });
      }
      return { success: true, tokenCount: totalTokens };
    } catch (error) {
      if (this.verbose) console.error('Error writing markdown document:', error);
      return { success: false, error };
    }
  }

  execCommand(command) {
    try {
      return execSync(command, { cwd: this.dir, encoding: 'utf-8' }).toString().trim();
    } catch (error) {
      if (this.verbose) console.error(`Error executing command: ${command}`, error);
      throw error;
    }
  }
}

src/TokenCleaner.js Normal file

@@ -0,0 +1,41 @@
// src/TokenCleaner.js
export class TokenCleaner {
  constructor(customPatterns = [], customSecretPatterns = []) {
    // Cleaning pass: strips comments, console calls, imports, and extra whitespace.
    this.patterns = [
      { regex: /\/\/.*$/gm, replacement: '' }, // line comments
      { regex: /\/\*[\s\S]*?\*\//gm, replacement: '' }, // block comments
      { regex: /console\.(log|error|warn|info)\(.*?\);?/g, replacement: '' }, // console calls
      { regex: /^\s*[\r\n]/gm, replacement: '' }, // empty lines
      { regex: / +$/gm, replacement: '' }, // trailing spaces
      { regex: /^\s*import\s+.*?;?\s*$/gm, replacement: '' }, // import statements
      { regex: /^\s*\n+/gm, replacement: '\n' }, // runs of blank lines
      ...customPatterns,
    ];
    // Redaction pass: replaces likely secrets with placeholders.
    this.secretPatterns = [
      // "api_key": "value" style entries in object literals
      { regex: /(?<=(['"])(?:api[_-]?key|api[_-]?secret|access[_-]?token|auth[_-]?token|client[_-]?secret|password|secret[_-]?key|private[_-]?key)['"]:\s*['"])[^'"]+(?=['"])/gi, replacement: '[REDACTED]' },
      // api_key = "value" style assignments
      { regex: /(?<=(?:api[_-]?key|api[_-]?secret|access[_-]?token|auth[_-]?token|client[_-]?secret|password|secret[_-]?key|private[_-]?key)\s*=\s*['"])[^'"]+(?=['"])/gi, replacement: '[REDACTED]' },
      // bearer tokens, bare or in Authorization headers
      { regex: /(?<=bearer\s+)[a-zA-Z0-9\-._~+\/]+=*/gi, replacement: '[REDACTED]' },
      { regex: /(?<=Authorization:\s*Bearer\s+)[a-zA-Z0-9\-._~+\/]+=*/gi, replacement: '[REDACTED]' },
      // JWTs (the lookbehind keeps the leading "eyJ")
      { regex: /(?<=eyJ)[A-Za-z0-9-_=]+\.eyJ[A-Za-z0-9-_=]+\.[A-Za-z0-9-_.+\/=]*/g, replacement: '[REDACTED_JWT]' },
      // 40/64-char hex digests (e.g. SHA-1, SHA-256)
      { regex: /([a-f0-9]{40}|[a-f0-9]{64})/gi, replacement: '[REDACTED_HASH]' },
      // 40/64-char base64 runs
      { regex: /(?<=[^A-Za-z0-9]|^)([A-Za-z0-9+\/]{40}|[A-Za-z0-9+\/]{64})(?=[^A-Za-z0-9]|$)/g, replacement: '[REDACTED_BASE64]' },
      ...customSecretPatterns,
    ];
  }

  // Applies the cleaning patterns in order.
  clean(code) {
    return this.patterns.reduce(
      (cleanCode, pattern) => cleanCode.replace(pattern.regex, pattern.replacement),
      code,
    );
  }

  // Applies the secret-redaction patterns in order.
  redactSecrets(code) {
    return this.secretPatterns.reduce(
      (redactedCode, pattern) => redactedCode.replace(pattern.regex, pattern.replacement),
      code,
    );
  }

  // Cleans first, then redacts what remains.
  cleanAndRedact(code) {
    const cleanedCode = this.clean(code);
    return this.redactSecrets(cleanedCode);
  }
}

src/cli.js Executable file

@@ -0,0 +1,14 @@
#!/usr/bin/env node
// src/cli.js
import { MarkdownGenerator } from './MarkdownGenerator.js';

const generator = new MarkdownGenerator();

generator.createMarkdownDocument()
  .then((result) => {
    // Failures are reported via the result object rather than thrown.
    if (!result.success) {
      process.exit(1);
    }
  })
  .catch((error) => {
    console.error('Error:', error);
    process.exit(1);
  });

src/index.js Normal file

@@ -0,0 +1,3 @@
// src/index.js
export { TokenCleaner } from './TokenCleaner.js';
export { MarkdownGenerator } from './MarkdownGenerator.js';