init
.gitignore (vendored, new file)
@@ -0,0 +1,3 @@
/.idea/
/node_modules/
/dist/
README.md (new file)
@@ -0,0 +1,136 @@
# code-tokenizer-md

Process git repository files into markdown with token counting and sensitive data redaction.

## Overview

`code-tokenizer-md` is a Node.js tool that processes git repository files, cleans code, redacts sensitive information, and generates markdown documentation with token counts.

```mermaid
graph TD
    Start[Start] -->|Read| Git[Git Files]
    Git -->|Clean| TC[TokenCleaner]
    TC -->|Redact| Clean[Clean Code]
    Clean -->|Generate| MD[Markdown]
    MD -->|Count| Results[Token Counts]
    style Start fill:#000000,stroke:#FFFFFF,stroke-width:4px,color:#ffffff
    style Git fill:#222222,stroke:#FFFFFF,stroke-width:2px,color:#ffffff
    style TC fill:#333333,stroke:#FFFFFF,stroke-width:2px,color:#ffffff
    style Clean fill:#444444,stroke:#FFFFFF,stroke-width:2px,color:#ffffff
    style MD fill:#555555,stroke:#FFFFFF,stroke-width:2px,color:#ffffff
    style Results fill:#666666,stroke:#FFFFFF,stroke-width:2px,color:#ffffff
```
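
A minimal sketch of that pipeline, using the exported `TokenCleaner` together with the `llama3-tokenizer-js` dependency (assuming the package is installed):

```javascript
import llama3Tokenizer from 'llama3-tokenizer-js';
import { TokenCleaner } from 'code-tokenizer-md';

// Strip the comment, redact the API key value, then count tokens.
const cleaner = new TokenCleaner();
const source = 'const apiKey = "sk-12345"; // remove this comment';
const cleaned = cleaner.cleanAndRedact(source);

console.log(cleaned); // -> const apiKey = "[REDACTED]";
console.log(llama3Tokenizer.encode(cleaned).length); // token count
```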

## Features

### Data Processing

- Reads tracked files from the git repository
- Removes comments and unnecessary whitespace
- Redacts sensitive information (API keys, tokens, etc.)
- Counts tokens using `llama3-tokenizer-js` (see the sketch above)

### Analysis Types

- Token counting per file
- Total token usage
- File content analysis
- Sensitive data detection

### Data Presentation

- Markdown-formatted output (shape shown below)
- Code block formatting
- Token count summaries
- File organization hierarchy
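
The generated document has this shape: a `# Project Files` heading followed by one `##` section per tracked file, with the cleaned contents in a `~~~` block (the ellipsis is a placeholder for real file contents):

```
# Project Files

## src/index.js
~~~
...cleaned, redacted contents...
~~~
```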

## Requirements

- Node.js (>=14.0.0)
- Git repository
- npm or npx

## Installation

```shell
npm install -g code-tokenizer-md
```

## Usage

### Quick Start

```shell
npx code-tokenizer-md
```
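
Run this from the root of the repository you want to process. Assuming the defaults in `MarkdownGenerator`, the resulting document is written to `./prompt.md` in that directory.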

### Programmatic Usage

```javascript
import { MarkdownGenerator } from 'code-tokenizer-md';

const generator = new MarkdownGenerator({
  dir: './project',
  outputFilePath: './output.md'
});

const result = await generator.createMarkdownDocument();
```
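
`createMarkdownDocument()` resolves to `{ success: true, tokenCount }` on success and `{ success: false, error }` on failure, so callers can branch on the result:

```javascript
if (result.success) {
  console.log(`Wrote ./output.md (${result.tokenCount} tokens)`);
} else {
  console.error('Generation failed:', result.error);
}
```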

## Project Structure

```
src/
├── index.js             # Main exports
├── TokenCleaner.js      # Code cleaning and redaction
├── MarkdownGenerator.js # Markdown generation logic
└── cli.js               # CLI implementation
```

## Dependencies

```json
{
  "dependencies": {
    "llama3-tokenizer-js": "^1.0.0"
  },
  "peerDependencies": {
    "node": ">=14.0.0"
  }
}
```
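
Note that `node` in `peerDependencies` refers to the npm package named `node` (the lockfile resolves it to `node@22.11.0` via `node-bin-setup`), while the actual runtime requirement is Node.js >= 14.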

## Extending

### Adding Custom Patterns

```javascript
const generator = new MarkdownGenerator({
  customPatterns: [
    { regex: /TODO:/g, replacement: '' }
  ],
  customSecretPatterns: [
    { regex: /mySecret/g, replacement: '[REDACTED]' }
  ]
});
```
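
Both lists are forwarded to `TokenCleaner`, which appends them after the built-in patterns and applies everything in order with `String.prototype.replace`. A quick way to check a pattern in isolation is to use `TokenCleaner` directly:

```javascript
import { TokenCleaner } from 'code-tokenizer-md';

const cleaner = new TokenCleaner(
  [{ regex: /TODO:/g, replacement: '' }],             // custom clean pattern
  [{ regex: /mySecret/g, replacement: '[REDACTED]' }] // custom secret pattern
);

console.log(cleaner.cleanAndRedact('let note = "TODO: rotate mySecret";'));
// -> let note = " rotate [REDACTED]";
```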

## Contributing

1. Fork the repository
2. Create a feature branch
3. Commit your changes
4. Push to the branch
5. Open a Pull Request

### Contribution Guidelines

- Follow Node.js best practices
- Include appropriate error handling
- Add documentation for new features
- Include tests for new functionality (the project does not yet have a test suite)
- Update the README for significant changes

## License

MIT © 2024 Geoff Seemueller

## Note

This tool requires a git repository to function properly.
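
File discovery shells out to `git ls-files`, so only tracked files are processed; untracked and ignored files never appear in the output.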
package.json (new file)
@@ -0,0 +1,15 @@
{
  "name": "code-tokenizer-md",
  "version": "1.0.0",
  "type": "module",
  "main": "src/index.js",
  "bin": {
    "code-tokenizer-md": "./src/cli.js"
  },
  "dependencies": {
    "llama3-tokenizer-js": "^1.0.0"
  },
  "peerDependencies": {
    "node": ">=14.0.0"
  }
}
pnpm-lock.yaml (generated, new file)
@@ -0,0 +1,32 @@
lockfileVersion: '6.0'

settings:
  autoInstallPeers: true
  excludeLinksFromLockfile: false

dependencies:
  llama3-tokenizer-js:
    specifier: ^1.0.0
    version: 1.2.0
  node:
    specifier: '>=14.0.0'
    version: 22.11.0

packages:

  /llama3-tokenizer-js@1.2.0:
    resolution: {integrity: sha512-oMgIgK958UlvoEm3Lz/gAj3QAKpnAMb6YqlY0aTYraSK/c+V3TF3P7IWFQJe4yjM60+2/KoK+EWziec6WQ57/g==}
    dev: false

  /node-bin-setup@1.1.3:
    resolution: {integrity: sha512-opgw9iSCAzT2+6wJOETCpeRYAQxSopqQ2z+N6BXwIMsQQ7Zj5M8MaafQY8JMlolRR6R1UXg2WmhKp0p9lSOivg==}
    dev: false

  /node@22.11.0:
    resolution: {integrity: sha512-RIAOdr40k1sq/DYF5u3XmhQHG+FZViuxObe2w1xPmOjEi4AiFgv/XRHW60YydS85X9gc8/jaI9RH4E9nvrV+gQ==}
    engines: {npm: '>=5.0.0'}
    hasBin: true
    requiresBuild: true
    dependencies:
      node-bin-setup: 1.1.3
    dev: false
src/MarkdownGenerator.js (new file)
@@ -0,0 +1,98 @@
// src/MarkdownGenerator.js
import path from 'path';
import { execSync } from 'child_process';
import fs from 'fs/promises';
import llama3Tokenizer from 'llama3-tokenizer-js';
import { TokenCleaner } from './TokenCleaner.js';

export class MarkdownGenerator {
  constructor(options = {}) {
    this.dir = options.dir || '.';
    this.outputFilePath = options.outputFilePath || './prompt.md';
    this.fileTypeExclusions = new Set(options.fileTypeExclusions || ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp', '.tiff', '.lockb', '.yaml', '.ico', '.ttf', '.css']);
    this.fileExclusions = options.fileExclusions || ['prompt.js', '.gitignore', '.env', '.dev.vars'];
    this.tokenCleaner = new TokenCleaner(options.customPatterns, options.customSecretPatterns);
    this.verbose = options.verbose ?? true;
  }

  async getTrackedFiles() {
    try {
      const output = this.execCommand('git ls-files');
      const trackedFiles = output.split('\n').filter(file => file.length > 0);
      if (this.verbose) console.log(`Total tracked files: ${trackedFiles.length}`);
      return trackedFiles.filter(file => {
        const fileExt = path.extname(file).toLowerCase();
        const isExcluded = this.fileExclusions.some(pattern => this.isFileExcluded(file, pattern));
        return !this.fileTypeExclusions.has(fileExt) && !isExcluded;
      });
    } catch (error) {
      if (this.verbose) console.error('Error fetching tracked files:', error);
      return [];
    }
  }

  // Supports three pattern forms: "dir/*" (anything under a directory),
  // "dir/*.ext" (a given extension under a directory), and exact paths.
  isFileExcluded(filePath, pattern) {
    if (pattern.endsWith('/*')) {
      const directory = pattern.slice(0, -2);
      return filePath.startsWith(directory);
    }
    if (pattern.includes('/*')) {
      const [directory, ext] = pattern.split('/*');
      return filePath.startsWith(directory) && filePath.endsWith(ext);
    }
    return filePath === pattern;
  }

  async readFileContent(filePath) {
    try {
      const content = await fs.readFile(filePath, 'utf-8');
      const cleanedAndRedactedContent = this.tokenCleaner.cleanAndRedact(content);
      if (this.verbose) {
        const tokenCount = llama3Tokenizer.encode(cleanedAndRedactedContent).length;
        console.log(`${filePath}: Tokens[${tokenCount}]`);
      }
      return cleanedAndRedactedContent;
    } catch (error) {
      if (this.verbose) console.error(`Error reading file ${filePath}:`, error);
      return '';
    }
  }

  async generateMarkdown() {
    const trackedFiles = await this.getTrackedFiles();
    if (this.verbose) console.log(`Generating markdown for ${trackedFiles.length} files`);
    let markdownContent = '# Project Files\n\n';

    for (const file of trackedFiles) {
      const content = await this.readFileContent(path.join(this.dir, file));
      markdownContent += `## ${file}\n~~~\n${content.trim()}\n~~~\n`;
    }
    return markdownContent;
  }

  async createMarkdownDocument() {
    try {
      const markdownContent = await this.generateMarkdown();
      await fs.writeFile(this.outputFilePath, markdownContent);
      // Encode once and reuse the count for both logging and the return value.
      const totalTokens = llama3Tokenizer.encode(markdownContent).length;
      if (this.verbose) {
        console.log(`Markdown document created at ${this.outputFilePath}`);
        console.log({ total_tokens: totalTokens });
      }
      return { success: true, tokenCount: totalTokens };
    } catch (error) {
      if (this.verbose) console.error('Error writing markdown document:', error);
      return { success: false, error };
    }
  }

  execCommand(command) {
    try {
      // execSync already returns a string when an encoding is set.
      return execSync(command, { cwd: this.dir, encoding: 'utf-8' }).trim();
    } catch (error) {
      if (this.verbose) console.error(`Error executing command: ${command}`, error);
      throw error;
    }
  }
}
src/TokenCleaner.js (new file)
@@ -0,0 +1,41 @@
// src/TokenCleaner.js
export class TokenCleaner {
  constructor(customPatterns = [], customSecretPatterns = []) {
    // Cleaning patterns, applied in order; custom patterns run last.
    this.patterns = [
      { regex: /\/\/.*$/gm, replacement: '' },                                // line comments
      { regex: /\/\*[\s\S]*?\*\//gm, replacement: '' },                       // block comments
      { regex: /console\.(log|error|warn|info)\(.*?\);?/g, replacement: '' }, // console calls
      { regex: /^\s*[\r\n]/gm, replacement: '' },                             // blank lines
      { regex: / +$/gm, replacement: '' },                                    // trailing whitespace
      { regex: /^\s*import\s+.*?;?\s*$/gm, replacement: '' },                 // import statements
      { regex: /^\s*\n+/gm, replacement: '\n' },                              // collapse runs of newlines
      ...customPatterns
    ];

    // Redaction patterns, applied after cleaning; custom patterns run last.
    this.secretPatterns = [
      // "api_key": "value" style entries in object/JSON literals
      { regex: /(?<=(['"])(?:api[_-]?key|api[_-]?secret|access[_-]?token|auth[_-]?token|client[_-]?secret|password|secret[_-]?key|private[_-]?key)['"]:\s*['"])[^'"]+(?=['"])/gi, replacement: '[REDACTED]' },
      // api_key = "value" style assignments
      { regex: /(?<=(?:api[_-]?key|api[_-]?secret|access[_-]?token|auth[_-]?token|client[_-]?secret|password|secret[_-]?key|private[_-]?key)\s*=\s*['"])[^'"]+(?=['"])/gi, replacement: '[REDACTED]' },
      // bearer tokens
      { regex: /(?<=bearer\s+)[a-zA-Z0-9\-._~+\/]+=*/gi, replacement: '[REDACTED]' },
      // Authorization headers
      { regex: /(?<=Authorization:\s*Bearer\s+)[a-zA-Z0-9\-._~+\/]+=*/gi, replacement: '[REDACTED]' },
      // JWTs (the lookbehind leaves the leading "eyJ" in place)
      { regex: /(?<=eyJ)[A-Za-z0-9-_=]+\.eyJ[A-Za-z0-9-_=]+\.[A-Za-z0-9-_.+\/=]*/g, replacement: '[REDACTED_JWT]' },
      // 40- or 64-character hex strings (SHA-1/SHA-256-sized digests)
      { regex: /([a-f0-9]{40}|[a-f0-9]{64})/gi, replacement: '[REDACTED_HASH]' },
      // 40- or 64-character base64-like strings
      { regex: /(?<=[^A-Za-z0-9]|^)([A-Za-z0-9+\/]{40}|[A-Za-z0-9+\/]{64})(?=[^A-Za-z0-9]|$)/g, replacement: '[REDACTED_BASE64]' },
      ...customSecretPatterns
    ];
  }

  clean(code) {
    return this.patterns.reduce((cleanCode, pattern) =>
      cleanCode.replace(pattern.regex, pattern.replacement), code);
  }

  redactSecrets(code) {
    return this.secretPatterns.reduce((redactedCode, pattern) =>
      redactedCode.replace(pattern.regex, pattern.replacement), code);
  }

  cleanAndRedact(code) {
    const cleanedCode = this.clean(code);
    return this.redactSecrets(cleanedCode);
  }
}
src/cli.js (new executable file)
@@ -0,0 +1,14 @@
#!/usr/bin/env node
import { MarkdownGenerator } from './MarkdownGenerator.js';

const generator = new MarkdownGenerator();
generator.createMarkdownDocument()
  .then(result => {
    if (!result.success) {
      process.exit(1);
    }
  })
  .catch(error => {
    console.error('Error:', error);
    process.exit(1);
  });
src/index.js (new file)
@@ -0,0 +1,3 @@
// src/index.js
export { TokenCleaner } from './TokenCleaner.js';
export { MarkdownGenerator } from './MarkdownGenerator.js';