enables deep research

2025-05-28 19:43:36 -04:00
parent 805b524471
commit 2e45c6b67b
14 changed files with 245 additions and 59 deletions
--- a/packages/genaiscript/genaisrc/deep-research.genai.mts
+++ b/packages/genaiscript/genaisrc/deep-research.genai.mts
@@ -1,7 +1,7 @@
-import {task, entrypoint, interrupt, MemorySaver} from "@langchain/langgraph"
+import {entrypoint, InMemoryStore, MemorySaver, task} from "@langchain/langgraph"
 import "./tools/searxng.genai.mjs"
 import {SearxngClient} from "@agentic/searxng";
-
+import ky from "ky";

 script({
    title: "Deep Research Program",
@@ -15,6 +15,7 @@ const {output, vars} = env
 const breakdownResearch = task(
    "breakdown_research",
    async (question: string) => {
+        output.log("Breaking down question:", question);
        const result = await runPrompt(
            async (ctx) => {
                ctx.$`You are an expert research strategist.
@@ -51,39 +52,44 @@ Output the breakdown as a JSON object.`
                },
            }
        )
-
+        output.fence(result.json, "json");
        return result.json
    }
 )

-const globalCtx = this;
-

 const researchSubQuestion = task(
    "research_subquestion",
    async (subQuestion: { id: string; question: string }) => {
+        output.log(`Researching sub-question: ${subQuestion.question}`);
+        const kyWithHeaders = ky.create({
+            referrerPolicy: "unsafe-url",

-        const searxng = new SearxngClient({apiBaseUrl: "https://search-engine-gsio.fly.dev"});
+            headers: {
+                'Authorization': 'Basic ' + btoa(`admin:${process.env.SEARXNG_PASSWORD}`),
+            }
+        });

-        const {text} = await runPrompt(
+        const searxng = new SearxngClient({ky: kyWithHeaders});
+
+        const {json} = await runPrompt(
            (_) => {
                _.defTool(searxng)
                _.$`You are an expert researcher with access to comprehensive information.
    
-Task: Thoroughly research the following question and provide a detailed answer.
+Task: Thoroughly research the following question and create a JSON formatted response.

 Question ID: ${subQuestion.id}
 Question: ${subQuestion.question}

-Provide your findings in a structured format that includes:
- Your answer to the sub-question
- Relevant sources that support your answer
- Your confidence level in the answer (0-1)`
+Respond with the specified JSON format.
+`
            },
            {
                model: "small",
                label: `research subquestion ${subQuestion.id}`,
                maxDataRepairs: 2,
+                responseType: "json_object",
                responseSchema: {
                    type: "object",
                    properties: {
@@ -105,19 +111,20 @@ Provide your findings in a structured format that includes:
                },
            }
        )
-        return text
+        output.fence(json, "json");
+        return json
    }
 )

-
 const synthesizeFindings = task(
    "synthesize_findings",
    async (mainQuestion: string, findings: any[]) => {
+        output.log(`Synthesizing Findings: ${JSON.stringify(findings, null, 2)}`);
        const result = await runPrompt(
            async (ctx) => {
                ctx.$`You are an expert research synthesizer.
    
-Task: Synthesize the following research findings into a coherent response to the main research question.
+Task: Synthesize the following research findings into a JSON object to answer the main research question.

 Main Research Question: ${mainQuestion}

@@ -128,11 +135,14 @@ Provide a synthesis that:
 1. Directly answers the main research question
 2. Integrates the findings from all sub-questions
 3. Identifies limitations in the current research
-4. Suggests next steps for further investigation`
+4. Suggests next steps for further investigation
+
+Respond in the specified JSON format.`
            },
            {
                label: "synthesize findings",
-                responseType: "markdown",
+                responseType: "json_object",
+                maxDataRepairs: 2,
                responseSchema: {
                    type: "object",
                    properties: {
@@ -147,15 +157,15 @@ Provide a synthesis that:
                },
            }
        )
-
+        output.fence(result.json, "json");
        return result.json
    }
 )

-
 const summarizeAndIdentifyGaps = task(
-    "summarize_and_identify_gaps",
+    {name: "summarize_and_identify_gaps"},
    async (synthesis: any, findings: any[]) => {
+        output.log(`Summarizing and identifying gaps: ${JSON.stringify(findings, null, 2)}`);
        const result = await runPrompt(
            async (ctx) => {
                ctx.$`You are an expert research evaluator.
@@ -171,10 +181,13 @@ ${JSON.stringify(findings, null, 2)}
 Please provide:
 1. A concise summary of current findings
 2. Identify 2-3 specific knowledge gaps
-3. Formulate follow-up questions to address these gaps`
+3. Formulate follow-up questions to address these gaps
+
+Respond using the specified JSON schema.`
            },
            {
                label: "identify research gaps",
+                maxDataRepairs: 2,
                responseSchema: {
                    type: "object",
                    properties: {
@@ -197,27 +210,28 @@ Please provide:
                },
            }
        )
+        output.fence(result.json, "json");
        return result.json
    }
 )

-
+// Research Workflow
 const researchWorkflow = entrypoint(
-    {checkpointer: new MemorySaver(), name: "research_workflow"},
+    {checkpointer: new MemorySaver(), name: "research_workflow", store: new InMemoryStore() },
    async (input: { question: string; context?: string }) => {
-
+        output.log(`Deep research initiated`);
+        // Step 1: Break down the research question
        const breakdown = await breakdownResearch(input.question)


+        // Step 2: Research each sub-question in parallel
        const subQuestionFindings = []
+        // The breakdown JSON may name the list `sub_questions` or `subQuestions`; accept either, defaulting to an empty list.
+        const subquestions = breakdown?.sub_questions || breakdown?.subQuestions || [];
+        const forSq = await Promise.all(subquestions.map((q) => researchSubQuestion(q)));
+        subQuestionFindings.push(...forSq) // spread-push: passing `push` unbound to map() loses `this` and throws

-        for (const sq of breakdown.subQuestions) {
-            const analysis = await researchSubQuestion(sq);
-            console.log(analysis);
-            subQuestionFindings.push(analysis);
-        }
-
-
+        // Step 3: Synthesize the findings
        let synthesis = await synthesizeFindings(
            input.question,
            subQuestionFindings
@@ -228,22 +242,21 @@ const researchWorkflow = entrypoint(
            subQuestionFindings
        )

-
+        // Step 5: Conduct follow-up research on identified gaps
        const followUpFindings = [];
        for (const fq of gapAnalysis.followUpQuestions) {
            const anwser = await researchSubQuestion(fq);
-            console.log(anwser);
            followUpFindings.push(anwser);
        }


+        // Step 6: Final synthesis with deep research
        const allFindings = [...subQuestionFindings, ...followUpFindings]
        const finalSynthesis = await synthesizeFindings(
            input.question,
            allFindings
        )

-
        return {
            question: input.question,
            breakdown: breakdown,
@@ -255,27 +268,26 @@ const researchWorkflow = entrypoint(
    }
 )

-
-const researchQuestion =
-    env.vars.question ||
-    "What are the most promising approaches to climate change mitigation?"
-
-
+// Timestamp-based thread ID, unique to this workflow run; passed below as `configurable.thread_id`
 const threadId = `research-${Date.now()}`

+const options = {
+    configurable: {thread_id: threadId},
+};

-const config = {
-    configurable: {
-        thread_id: threadId,
-    },
-}
+const researchQuestion = env.vars.user_input;

+const inputs =  {
+    question: researchQuestion,
+    context: vars.context || "",
+};

+// Execute workflow. Checkpoints are manually sent across the wire in the tasks.
 const results = await researchWorkflow.invoke(
+    inputs,
    {
-        question: researchQuestion,
-        context: vars.context || "",
-    },
-    config
+        ...options,
+    }
 )
-output.fence(results, "json")
+
+env.output.fence(results)
--- a/packages/genaiscript/package.json
+++ b/packages/genaiscript/package.json
@@ -9,13 +9,16 @@
    "shim:ai:search": "pnpm build && ./dist/shim.js --file=genaisrc/search.genai.mts USER_INPUT=\"Who won the 2024 presidential election?\"\n",
    "ai:news": "genaiscript run genaisrc/news-search.genai.mts --vars USER_INPUT='What are the latest updates and developments in the Ukraine war?'",
    "ai:url:read": "genaiscript run genaisrc/web-scrape.genai.mts --vars USER_INPUT='{\"url\":\"https://geoff.seemueller.io/about\",\"query\":\"Describe the details of the page.\", \"action\": \"read\"}'",
-    "ai:url:scrape": "npx genaiscript run genaisrc/web-scrape.genai.mts --vars USER_INPUT='{\"url\":\"https://www.time4learning.com/homeschool-curriculum/high-school/eleventh-grade/math.html\",\"query\":\"What is on this page?\", \"action\": \"scrape\"}'"
+    "ai:url:scrape": "npx genaiscript run genaisrc/web-scrape.genai.mts --vars USER_INPUT='{\"url\":\"https://www.time4learning.com/homeschool-curriculum/high-school/eleventh-grade/math.html\",\"query\":\"What is on this page?\", \"action\": \"scrape\"}'",
+    "typegen": "genaiscript scripts compile"
  },
  "dependencies": {
    "@agentic/searxng": "7.5.3",
    "@types/node": "^22.10.2",
    "genaiscript": "^1.95.1",
    "happy-dom": "^16.0.1",
-    "ky": "^1.8.0"
+    "ky": "^1.8.0",
+    "md-front-matter": "^1.0.4",
+    "@langchain/langgraph":"0.2.57"
  }
 }