init
This commit is contained in:
88
packages/genaiscript/genaisrc/web-scrape.genai.mts
Normal file
88
packages/genaiscript/genaisrc/web-scrape.genai.mts
Normal file
@@ -0,0 +1,88 @@
|
||||
import {Window} from 'happy-dom';
|
||||
import {platform} from 'os';
|
||||
|
||||
script({
|
||||
title: "scrape",
|
||||
cache: false,
|
||||
});
|
||||
|
||||
/*
|
||||
"url": "Full URL in the conversation that references the URL being interacted with. No trailing slash!",
|
||||
"query": "Implied question about the resources at the URL.",
|
||||
"action": "read | scrape | crawl"
|
||||
*/
|
||||
|
||||
try {
|
||||
const {url, query, action} = JSON.parse(env.vars.user_input);
|
||||
} catch (e) {
|
||||
throw "Sorry! Something went wrong.";
|
||||
}
|
||||
|
||||
const {url, query, action} = JSON.parse(env.vars.user_input);
|
||||
|
||||
def("URL", url);
|
||||
|
||||
def("QUERY", query);
|
||||
|
||||
def("ACTION", action);
|
||||
|
||||
// console.log({url, query, action});
|
||||
|
||||
if(!(new URL(url) ?? undefined)) {
|
||||
throw "Bad URL. Maybe try again?"
|
||||
}
|
||||
|
||||
function getBrowser(): "webkit" | "chromium" | "firefox" {
|
||||
if (platform() === 'darwin') {
|
||||
return "webkit"; // macOS is identified by 'darwin'
|
||||
}
|
||||
return "chromium"; // default to chromium for other platforms
|
||||
}
|
||||
|
||||
const {text} = await host.fetchText(new URL(url).toString());
|
||||
|
||||
// const browser = getBrowser();
|
||||
|
||||
// const page = await host.browse(new URL(url).toString(), {
|
||||
// browser: getBrowser(),
|
||||
// headless: true,
|
||||
// javaScriptEnabled: browser !== "chromium",
|
||||
// // timeout: 3000,
|
||||
// // bypassCSP: true,
|
||||
// // baseUrl: new URL(url).origin,
|
||||
// });
|
||||
//
|
||||
// const html = (await page.content());
|
||||
// const title = (await page.title());
|
||||
|
||||
// console.log({html});
|
||||
|
||||
const window = new Window({
|
||||
// url: "http://localhost:8080",
|
||||
height: 1920,
|
||||
width: 1080,
|
||||
settings: {
|
||||
navigator: {
|
||||
userAgent: 'Mozilla/5.0 (compatible; GeoffsAI/1.0; +https://geoff.seemueller.io)',
|
||||
},
|
||||
}
|
||||
});
|
||||
|
||||
window.document.body.innerHTML = text;
|
||||
|
||||
const textContent = window.document.body.textContent;
|
||||
|
||||
def("PAGE_TEXT", textContent);
|
||||
|
||||
$`You a helpful assistant interacting with resources found at the URL.
|
||||
|
||||
- markdown table is concise representation of PAGE_TEXT relevant to the QUERY
|
||||
|
||||
### Respond Example:
|
||||
### Data from ${url}:
|
||||
| Header 1 | Header 2 | Header 3 |
|
||||
|----------|----------|----------|
|
||||
| Data 1 | Data 2 | Data 3 |
|
||||
\n---[Example explanation of data significance to query.]
|
||||
---
|
||||
Respond with the markdown table and an explanation of significance. Do not include extra text.`;
|
Reference in New Issue
Block a user