diff --git a/maxun-core/src/interpret.ts b/maxun-core/src/interpret.ts
index e6bd62f9..c6dd653b 100644
--- a/maxun-core/src/interpret.ts
+++ b/maxun-core/src/interpret.ts
@@ -80,7 +80,9 @@ export default class Interpreter extends EventEmitter {
   private serializableDataByType: Record<string, Record<string, any>> = {
     scrapeList: {},
-    scrapeSchema: {}
+    scrapeSchema: {},
+    crawl: {},
+    search: {}
   };
 
   private scrapeListCounter: number = 0;
@@ -570,7 +572,9 @@ export default class Interpreter extends EventEmitter {
       await this.options.serializableCallback({
         scrapeList: this.serializableDataByType.scrapeList,
-        scrapeSchema: this.serializableDataByType.scrapeSchema
+        scrapeSchema: this.serializableDataByType.scrapeSchema,
+        crawl: this.serializableDataByType.crawl || {},
+        search: this.serializableDataByType.search || {}
       });
     },
@@ -708,6 +712,750 @@ export default class Interpreter extends EventEmitter {
       }
     },
+    crawl: async (crawlConfig: {
+      mode: 'domain' | 'subdomain' | 'path';
+      limit: number;
+      maxDepth: number;
+      includePaths: string[];
+      excludePaths: string[];
+      useSitemap: boolean;
+      followLinks: boolean;
+      respectRobots: boolean;
+    }) => {
+      if (this.isAborted) {
+        this.log('Workflow aborted, stopping crawl', Level.WARN);
+        return;
+      }
+
+      if (this.options.debugChannel?.setActionType) {
+        this.options.debugChannel.setActionType('crawl');
+      }
+
+      this.log('Starting crawl operation', Level.LOG);
+
+      try {
+        const currentUrl = page.url();
+        this.log(`Current page URL: ${currentUrl}`, Level.LOG);
+
+        if (!currentUrl || currentUrl === 'about:blank' || currentUrl === '') {
+          this.log('Page not yet navigated, waiting for navigation...', Level.WARN);
+          await page.waitForLoadState('load', { timeout: 10000 }).catch(() => {});
+        }
+
+        const baseUrl = page.url();
+        this.log(`Using base URL for crawl: ${baseUrl}`, Level.LOG);
+
+        const parsedBase = new URL(baseUrl);
+        const baseDomain = parsedBase.hostname;
+
+        let discoveredUrls: string[] = [];
+
+        if (crawlConfig.useSitemap) {
+          this.log('Fetching sitemap URLs...', Level.LOG);
+          try {
+            const sitemapUrl = `${parsedBase.protocol}//${parsedBase.host}/sitemap.xml`;
+
+            const sitemapUrls = await page.evaluate((url) => {
+              return new Promise<string[]>((resolve) => {
+                const xhr = new XMLHttpRequest();
+                xhr.open('GET', url, true);
+                xhr.onload = function() {
+                  if (xhr.status === 200) {
+                    const text = xhr.responseText;
+                    const locMatches = text.match(/<loc>(.*?)<\/loc>/g) || [];
+                    const urls = locMatches.map(match => match.replace(/<\/?loc>/g, ''));
+                    resolve(urls);
+                  } else {
+                    resolve([]);
+                  }
+                };
+                xhr.onerror = function() {
+                  resolve([]);
+                };
+                xhr.send();
+              });
+            }, sitemapUrl);
+
+            if (sitemapUrls.length > 0) {
+              const nestedSitemaps = sitemapUrls.filter(url =>
+                url.endsWith('/sitemap') || url.endsWith('sitemap.xml') || url.includes('/sitemap/')
+              );
+              const regularUrls = sitemapUrls.filter(url =>
+                !url.endsWith('/sitemap') && !url.endsWith('sitemap.xml') && !url.includes('/sitemap/')
+              );
+
+              discoveredUrls.push(...regularUrls);
+              this.log(`Found ${regularUrls.length} regular URLs from main sitemap`, Level.LOG);
+
+              for (const nestedUrl of nestedSitemaps.slice(0, 10)) {
+                try {
+                  this.log(`Fetching nested sitemap: ${nestedUrl}`, Level.LOG);
+                  const nestedUrls = await page.evaluate((url) => {
+                    return new Promise<string[]>((resolve) => {
+                      const xhr = new XMLHttpRequest();
+                      xhr.open('GET', url, true);
+                      xhr.onload = function() {
+                        if (xhr.status === 200) {
+                          const text = xhr.responseText;
+                          const locMatches = text.match(/<loc>(.*?)<\/loc>/g) || [];
+                          const urls = locMatches.map(match =>
match.replace(/<\/?loc>/g, '')); + resolve(urls); + } else { + resolve([]); + } + }; + xhr.onerror = function() { + resolve([]); + }; + xhr.send(); + }); + }, nestedUrl); + + if (nestedUrls.length > 0) { + discoveredUrls.push(...nestedUrls); + this.log(`Found ${nestedUrls.length} URLs from nested sitemap ${nestedUrl}`, Level.LOG); + } + } catch (error) { + this.log(`Failed to fetch nested sitemap ${nestedUrl}: ${error.message}`, Level.WARN); + } + } + + this.log(`Total URLs from all sitemaps: ${discoveredUrls.length}`, Level.LOG); + } else { + this.log('No URLs found in sitemap or sitemap not available', Level.WARN); + } + } catch (error) { + this.log(`Sitemap fetch failed: ${error.message}`, Level.WARN); + } + } + + if (crawlConfig.followLinks) { + this.log('Extracting links from current page...', Level.LOG); + try { + await page.waitForLoadState('load', { timeout: 15000 }).catch(() => {}); + + await page.waitForLoadState('networkidle', { timeout: 10000 }).catch(() => { + this.log('Network did not become idle, continuing anyway', Level.WARN); + }); + + await new Promise(resolve => setTimeout(resolve, 5000)); + + const anchorCount = await page.evaluate(() => { + return document.querySelectorAll('a').length; + }); + this.log(`Page has ${anchorCount} total anchor tags`, Level.LOG); + + const pageLinks = await page.evaluate(() => { + const links: string[] = []; + const allAnchors = document.querySelectorAll('a'); + console.log('Total anchors found:', allAnchors.length); + + for (let i = 0; i < allAnchors.length; i++) { + const anchor = allAnchors[i] as HTMLAnchorElement; + const href = anchor.getAttribute('href'); + const fullHref = anchor.href; + + if (fullHref && (fullHref.startsWith('http://') || fullHref.startsWith('https://'))) { + links.push(fullHref); + } + } + + console.log('Links extracted:', links.length); + return links; + }); + + discoveredUrls.push(...pageLinks); + this.log(`Found ${pageLinks.length} links from page`, Level.LOG); + } catch (error) { + this.log(`Link extraction failed: ${error.message}`, Level.WARN); + } + } + + const filteredUrls = discoveredUrls.filter(url => { + try { + const urlObj = new URL(url); + + if (crawlConfig.mode === 'domain') { + if (urlObj.hostname !== baseDomain) return false; + } else if (crawlConfig.mode === 'subdomain') { + if (!urlObj.hostname.endsWith(baseDomain) && urlObj.hostname !== baseDomain) return false; + } else if (crawlConfig.mode === 'path') { + if (urlObj.hostname !== baseDomain || !urlObj.pathname.startsWith(parsedBase.pathname)) return false; + } + + if (crawlConfig.includePaths && crawlConfig.includePaths.length > 0) { + const matches = crawlConfig.includePaths.some(pattern => { + const regex = new RegExp(pattern); + return regex.test(url); + }); + if (!matches) return false; + } + + if (crawlConfig.excludePaths && crawlConfig.excludePaths.length > 0) { + const matches = crawlConfig.excludePaths.some(pattern => { + const regex = new RegExp(pattern); + return regex.test(url); + }); + if (matches) return false; + } + + return true; + } catch (error) { + return false; + } + }); + + const uniqueUrls = Array.from(new Set(filteredUrls.map(url => { + return url.replace(/#.*$/, '').replace(/\/$/, ''); + }))); + + const basePathname = parsedBase.pathname; + const prioritizedUrls = uniqueUrls.sort((a, b) => { + try { + const aUrl = new URL(a); + const bUrl = new URL(b); + const aMatchesBase = aUrl.pathname.startsWith(basePathname); + const bMatchesBase = bUrl.pathname.startsWith(basePathname); + + if (aMatchesBase && !bMatchesBase) return 
-1; + if (!aMatchesBase && bMatchesBase) return 1; + + return 0; + } catch (error) { + return 0; + } + }); + + const finalUrls = prioritizedUrls.slice(0, crawlConfig.limit); + + this.log(`Crawl discovered ${finalUrls.length} URLs (from ${discoveredUrls.length} total)`, Level.LOG); + + this.log(`Starting to scrape content from ${finalUrls.length} discovered URLs...`, Level.LOG); + const crawlResults = []; + + for (let i = 0; i < finalUrls.length; i++) { + const url = finalUrls[i]; + try { + this.log(`[${i + 1}/${finalUrls.length}] Scraping: ${url}`, Level.LOG); + + await page.goto(url, { + waitUntil: 'domcontentloaded', + timeout: 30000 + }).catch(() => { + this.log(`Failed to navigate to ${url}, skipping...`, Level.WARN); + }); + + await page.waitForLoadState('load', { timeout: 10000 }).catch(() => {}); + + const pageData = await page.evaluate(() => { + const getMeta = (name: string) => { + const meta = document.querySelector(`meta[name="${name}"], meta[property="${name}"]`); + return meta?.getAttribute('content') || ''; + }; + + const getAllMeta = () => { + const metadata: Record = {}; + const metaTags = document.querySelectorAll('meta'); + metaTags.forEach(tag => { + const name = tag.getAttribute('name') || tag.getAttribute('property'); + const content = tag.getAttribute('content'); + if (name && content) { + metadata[name] = content; + } + }); + return metadata; + }; + + const title = document.title || ''; + const bodyText = document.body?.innerText || ''; + + const elementsWithMxId = document.querySelectorAll('[data-mx-id]'); + elementsWithMxId.forEach(el => el.removeAttribute('data-mx-id')); + + const html = document.documentElement.outerHTML; + const links = Array.from(document.querySelectorAll('a')).map(a => a.href); + const allMetadata = getAllMeta(); + + return { + title, + description: getMeta('description'), + text: bodyText, + html: html, + links: links, + wordCount: bodyText.split(/\s+/).filter(w => w.length > 0).length, + metadata: { + ...allMetadata, + title, + language: document.documentElement.lang || '', + favicon: (document.querySelector('link[rel="icon"], link[rel="shortcut icon"]') as HTMLLinkElement)?.href || '', + statusCode: 200 + } + }; + }); + + crawlResults.push({ + metadata: { + ...pageData.metadata, + url: url, + sourceURL: url + }, + html: pageData.html, + text: pageData.text, + links: pageData.links, + wordCount: pageData.wordCount, + scrapedAt: new Date().toISOString() + }); + + this.log(`✓ Scraped ${url} (${pageData.wordCount} words)`, Level.LOG); + + } catch (error) { + this.log(`Failed to scrape ${url}: ${error.message}`, Level.WARN); + crawlResults.push({ + url: url, + error: error.message, + scrapedAt: new Date().toISOString() + }); + } + } + + this.log(`Successfully scraped ${crawlResults.length} pages`, Level.LOG); + + const actionType = "crawl"; + const actionName = "Crawl Results"; + + if (!this.serializableDataByType[actionType]) { + this.serializableDataByType[actionType] = {}; + } + if (!this.serializableDataByType[actionType][actionName]) { + this.serializableDataByType[actionType][actionName] = []; + } + + this.serializableDataByType[actionType][actionName] = crawlResults; + + await this.options.serializableCallback({ + scrapeList: this.serializableDataByType.scrapeList || {}, + scrapeSchema: this.serializableDataByType.scrapeSchema || {}, + crawl: this.serializableDataByType.crawl || {}, + search: this.serializableDataByType.search || {} + }); + + } catch (error) { + this.log(`Crawl action failed: ${error.message}`, Level.ERROR); + throw new 
Error(`Crawl execution error: ${error.message}`); + } + }, + + search: async (searchConfig: { + query: string; + limit: number; + provider?: 'duckduckgo'; + filters?: { + timeRange?: 'day' | 'week' | 'month' | 'year'; + location?: string; + lang?: string; + }; + mode: 'discover' | 'scrape'; + }) => { + if (this.isAborted) { + this.log('Workflow aborted, stopping search', Level.WARN); + return; + } + + if (this.options.debugChannel?.setActionType) { + this.options.debugChannel.setActionType('search'); + } + + searchConfig.provider = 'duckduckgo'; + + this.log(`Performing DuckDuckGo search for: ${searchConfig.query}`, Level.LOG); + + try { + let searchUrl = `https://duckduckgo.com/?q=${encodeURIComponent(searchConfig.query)}`; + + if (searchConfig.filters?.timeRange) { + const timeMap: Record = { + 'day': 'd', + 'week': 'w', + 'month': 'm', + 'year': 'y' + }; + searchUrl += `&df=${timeMap[searchConfig.filters.timeRange]}`; + } + + const initialDelay = 500 + Math.random() * 1000; + await new Promise(resolve => setTimeout(resolve, initialDelay)); + + await page.goto(searchUrl, { waitUntil: 'domcontentloaded', timeout: 30000 }); + + await page.waitForLoadState('load', { timeout: 10000 }).catch(() => { + this.log('Load state timeout, continuing anyway', Level.WARN); + }); + + const pageLoadDelay = 2000 + Math.random() * 1500; + await new Promise(resolve => setTimeout(resolve, pageLoadDelay)); + + let searchResults: any[] = []; + let retryCount = 0; + const maxRetries = 2; + + while (searchResults.length === 0 && retryCount <= maxRetries) { + if (retryCount > 0) { + this.log(`Retry attempt ${retryCount}/${maxRetries} for DuckDuckGo search...`, Level.LOG); + const retryDelay = 1000 * Math.pow(2, retryCount) + Math.random() * 1000; + await new Promise(resolve => setTimeout(resolve, retryDelay)); + } + + this.log('Attempting to extract DuckDuckGo search results...', Level.LOG); + + await page.waitForSelector('[data-testid="result"], .result', { timeout: 5000 }).catch(() => { + this.log('DuckDuckGo results not found on initial wait', Level.WARN); + }); + + let currentResultCount = 0; + const maxLoadAttempts = Math.ceil(searchConfig.limit / 10) * 2; + let loadAttempts = 0; + let noNewResultsCount = 0; + + while (currentResultCount < searchConfig.limit && loadAttempts < maxLoadAttempts && noNewResultsCount < 3) { + const previousCount = currentResultCount; + + currentResultCount = await page.evaluate(() => { + const selectors = [ + '[data-testid="result"]', + 'article[data-testid="result"]', + 'li[data-layout="organic"]', + '.result', + 'article[data-testid]' + ]; + + for (const selector of selectors) { + const elements = document.querySelectorAll(selector); + if (elements.length > 0) { + return elements.length; + } + } + return 0; + }); + + if (currentResultCount >= searchConfig.limit) { + this.log(`Reached desired result count: ${currentResultCount}`, Level.LOG); + break; + } + + if (currentResultCount === previousCount) { + noNewResultsCount++; + this.log(`No new results after load more (attempt ${noNewResultsCount}/3)`, Level.WARN); + if (noNewResultsCount >= 3) break; + } else { + noNewResultsCount = 0; + this.log(`Current results count: ${currentResultCount}/${searchConfig.limit}`, Level.LOG); + } + + await page.evaluate(() => { + window.scrollTo(0, document.body.scrollHeight); + }); + + await new Promise(resolve => setTimeout(resolve, 800)); + + const loadMoreClicked = await page.evaluate(() => { + const selectors = [ + '#more-results', + 'button:has-text("More results")', + 
'button:has-text("more results")', + 'button[id*="more"]', + 'button:has-text("Load more")' + ]; + + for (const selector of selectors) { + try { + const button = document.querySelector(selector) as HTMLButtonElement; + if (button && button.offsetParent !== null) { + button.click(); + console.log(`Clicked load more button with selector: ${selector}`); + return true; + } + } catch (e) { + continue; + } + } + return false; + }); + + if (loadMoreClicked) { + this.log('Clicked "More results" button', Level.LOG); + await new Promise(resolve => setTimeout(resolve, 1500 + Math.random() * 1000)); + } else { + this.log('No "More results" button found, results may be limited', Level.WARN); + break; + } + + loadAttempts++; + } + + this.log(`Finished pagination. Total results available: ${currentResultCount}`, Level.LOG); + + searchResults = await page.evaluate((limit: number) => { + const results: any[] = []; + + const cleanDescription = (text: string): string => { + if (!text) return ''; + let cleaned = text.replace(/^\d+\s+(second|minute|hour|day|week|month|year)s?\s+ago\s*/i, ''); + cleaned = cleaned.replace(/^[A-Z][a-z]{2}\s+\d{1,2},?\s+\d{4}\s*[—\-]\s*/i, ''); + cleaned = cleaned.replace(/^\d{4}-\d{2}-\d{2}\s*[—\-]\s*/i, ''); + cleaned = cleaned.trim().replace(/\s+/g, ' '); + return cleaned; + }; + + const selectors = [ + '[data-testid="result"]', + 'article[data-testid="result"]', + 'li[data-layout="organic"]', + '.result', + 'article[data-testid]' + ]; + let allElements: Element[] = []; + + for (const selector of selectors) { + const elements = Array.from(document.querySelectorAll(selector)); + if (elements.length > 0) { + console.log(`Found ${elements.length} DDG elements with: ${selector}`); + allElements = elements; + break; + } + } + + for (let i = 0; i < Math.min(allElements.length, limit); i++) { + const element = allElements[i]; + + const titleEl = element.querySelector('h2, [data-testid="result-title-a"], h3, [data-testid="result-title"]'); + + let linkEl = titleEl?.querySelector('a[href]') as HTMLAnchorElement; + if (!linkEl) { + linkEl = element.querySelector('a[href]') as HTMLAnchorElement; + } + + if (!linkEl || !linkEl.href) continue; + + let actualUrl = linkEl.href; + + if (actualUrl.includes('uddg=')) { + try { + const urlParams = new URLSearchParams(actualUrl.split('?')[1]); + const uddgUrl = urlParams.get('uddg'); + if (uddgUrl) { + actualUrl = decodeURIComponent(uddgUrl); + } + } catch (e) { + console.log('Failed to parse uddg parameter:', e); + } + } + + if (actualUrl.includes('duckduckgo.com')) { + console.log(`Skipping DDG internal URL: ${actualUrl}`); + continue; + } + + const descEl = element.querySelector('[data-result="snippet"], .result__snippet, [data-testid="result-snippet"]'); + + if (titleEl && titleEl.textContent && actualUrl) { + const rawDescription = (descEl?.textContent || '').trim(); + const cleanedDescription = cleanDescription(rawDescription); + + results.push({ + url: actualUrl, + title: titleEl.textContent.trim(), + description: cleanedDescription, + position: results.length + 1 + }); + } + } + + console.log(`Extracted ${results.length} DuckDuckGo search results`); + return results; + }, searchConfig.limit); + + if (searchResults.length === 0) { + this.log(`No DuckDuckGo results found (attempt ${retryCount + 1}/${maxRetries + 1})`, Level.WARN); + retryCount++; + } else { + this.log(`Successfully extracted ${searchResults.length} results`, Level.LOG); + break; + } + } + + this.log(`Search found ${searchResults.length} results`, Level.LOG); + + if 
(searchConfig.mode === 'discover') { + const actionType = "search"; + const actionName = "Search Results"; + + if (!this.serializableDataByType[actionType]) { + this.serializableDataByType[actionType] = {}; + } + if (!this.serializableDataByType[actionType][actionName]) { + this.serializableDataByType[actionType][actionName] = {}; + } + + const searchData = { + query: searchConfig.query, + provider: searchConfig.provider, + filters: searchConfig.filters || {}, + resultsCount: searchResults.length, + results: searchResults, + searchedAt: new Date().toISOString() + }; + + this.serializableDataByType[actionType][actionName] = searchData; + + await this.options.serializableCallback({ + scrapeList: this.serializableDataByType.scrapeList || {}, + scrapeSchema: this.serializableDataByType.scrapeSchema || {}, + crawl: this.serializableDataByType.crawl || {}, + search: this.serializableDataByType.search || {} + }); + + this.log(`Search completed in discover mode with ${searchResults.length} results`, Level.LOG); + return; + } + + this.log(`Starting to scrape content from ${searchResults.length} search results...`, Level.LOG); + const scrapedResults = []; + + for (let i = 0; i < searchResults.length; i++) { + const result = searchResults[i]; + try { + this.log(`[${i + 1}/${searchResults.length}] Scraping: ${result.url}`, Level.LOG); + + await page.goto(result.url, { + waitUntil: 'domcontentloaded', + timeout: 30000 + }).catch(() => { + this.log(`Failed to navigate to ${result.url}, skipping...`, Level.WARN); + }); + + await page.waitForLoadState('load', { timeout: 10000 }).catch(() => {}); + + const pageData = await page.evaluate(() => { + const getMeta = (name: string) => { + const meta = document.querySelector(`meta[name="${name}"], meta[property="${name}"]`); + return meta?.getAttribute('content') || ''; + }; + + const getAllMeta = () => { + const metadata: Record = {}; + const metaTags = document.querySelectorAll('meta'); + metaTags.forEach(tag => { + const name = tag.getAttribute('name') || tag.getAttribute('property'); + const content = tag.getAttribute('content'); + if (name && content) { + metadata[name] = content; + } + }); + return metadata; + }; + + const title = document.title || ''; + const bodyText = document.body?.innerText || ''; + + const elementsWithMxId = document.querySelectorAll('[data-mx-id]'); + elementsWithMxId.forEach(el => el.removeAttribute('data-mx-id')); + + const html = document.documentElement.outerHTML; + const links = Array.from(document.querySelectorAll('a')).map(a => a.href); + const allMetadata = getAllMeta(); + + return { + title, + description: getMeta('description'), + text: bodyText, + html: html, + links: links, + wordCount: bodyText.split(/\s+/).filter(w => w.length > 0).length, + metadata: { + ...allMetadata, + title, + language: document.documentElement.lang || '', + favicon: (document.querySelector('link[rel="icon"], link[rel="shortcut icon"]') as HTMLLinkElement)?.href || '', + statusCode: 200 + } + }; + }); + + scrapedResults.push({ + searchResult: { + query: searchConfig.query, + position: result.position, + searchTitle: result.title, + searchDescription: result.description, + }, + metadata: { + ...pageData.metadata, + url: result.url, + sourceURL: result.url + }, + html: pageData.html, + text: pageData.text, + links: pageData.links, + wordCount: pageData.wordCount, + scrapedAt: new Date().toISOString() + }); + + this.log(`✓ Scraped ${result.url} (${pageData.wordCount} words)`, Level.LOG); + + } catch (error) { + this.log(`Failed to scrape 
${result.url}: ${error.message}`, Level.WARN); + scrapedResults.push({ + searchResult: { + query: searchConfig.query, + position: result.position, + searchTitle: result.title, + searchDescription: result.description, + }, + url: result.url, + error: error.message, + scrapedAt: new Date().toISOString() + }); + } + } + + this.log(`Successfully scraped ${scrapedResults.length} search results`, Level.LOG); + + const actionType = "search"; + const actionName = "Search Results"; + + if (!this.serializableDataByType[actionType]) { + this.serializableDataByType[actionType] = {}; + } + if (!this.serializableDataByType[actionType][actionName]) { + this.serializableDataByType[actionType][actionName] = {}; + } + + const searchData = { + query: searchConfig.query, + provider: searchConfig.provider, + filters: searchConfig.filters || {}, + mode: searchConfig.mode, + resultsCount: scrapedResults.length, + results: scrapedResults, + searchedAt: new Date().toISOString() + }; + + this.serializableDataByType[actionType][actionName] = searchData; + + await this.options.serializableCallback({ + scrapeList: this.serializableDataByType.scrapeList || {}, + scrapeSchema: this.serializableDataByType.scrapeSchema || {}, + crawl: this.serializableDataByType.crawl || {}, + search: this.serializableDataByType.search || {} + }); + + } catch (error) { + this.log(`Search action failed: ${error.message}`, Level.ERROR); + throw new Error(`Search execution error: ${error.message}`); + } + }, + flag: async () => new Promise((res) => { if (this.options.debugChannel?.setActionType) { this.options.debugChannel.setActionType('flag'); @@ -890,7 +1638,9 @@ export default class Interpreter extends EventEmitter { this.serializableDataByType[actionType][actionName] = [...allResults]; await this.options.serializableCallback({ scrapeList: this.serializableDataByType.scrapeList, - scrapeSchema: this.serializableDataByType.scrapeSchema + scrapeSchema: this.serializableDataByType.scrapeSchema, + crawl: this.serializableDataByType.crawl || {}, + search: this.serializableDataByType.search || {} }); }; @@ -1758,7 +2508,7 @@ export default class Interpreter extends EventEmitter { // Clear accumulated data to free memory this.cumulativeResults = []; this.namedResults = {}; - this.serializableDataByType = { scrapeList: {}, scrapeSchema: {} }; + this.serializableDataByType = { scrapeList: {}, scrapeSchema: {}, crawl: {}, search: {} }; // Reset state this.isAborted = false; diff --git a/maxun-core/src/types/workflow.ts b/maxun-core/src/types/workflow.ts index 91278009..54c5892e 100644 --- a/maxun-core/src/types/workflow.ts +++ b/maxun-core/src/types/workflow.ts @@ -28,7 +28,7 @@ type MethodNames = { [K in keyof T]: T[K] extends Function ? 
K : never; }[keyof T]; -export type CustomFunctions = 'scrape' | 'scrapeSchema' | 'scroll' | 'screenshot' | 'script' | 'enqueueLinks' | 'flag' | 'scrapeList' | 'scrapeListAuto'; +export type CustomFunctions = 'scrape' | 'scrapeSchema' | 'scroll' | 'screenshot' | 'script' | 'enqueueLinks' | 'flag' | 'scrapeList' | 'scrapeListAuto' | 'crawl' | 'search'; export type What = { action: MethodNames | CustomFunctions, diff --git a/server/src/api/record.ts b/server/src/api/record.ts index f6ef6ea7..572bc2bc 100644 --- a/server/src/api/record.ts +++ b/server/src/api/record.ts @@ -13,8 +13,8 @@ import { AuthenticatedRequest } from "../routes/record" import {capture} from "../utils/analytics"; import { Page } from "playwright-core"; import { WorkflowFile } from "maxun-core"; -import { addGoogleSheetUpdateTask, googleSheetUpdateTasks, processGoogleSheetUpdates } from "../workflow-management/integrations/gsheet"; -import { addAirtableUpdateTask, airtableUpdateTasks, processAirtableUpdates } from "../workflow-management/integrations/airtable"; +import { addGoogleSheetUpdateTask, processGoogleSheetUpdates } from "../workflow-management/integrations/gsheet"; +import { addAirtableUpdateTask, processAirtableUpdates } from "../workflow-management/integrations/airtable"; import { sendWebhook } from "../routes/webhook"; import { convertPageToHTML, convertPageToMarkdown, convertPageToScreenshot } from '../markdownify/scrape'; @@ -309,8 +309,8 @@ router.get("/robots/:id/runs",requireAPIKey, async (req: Request, res: Response) statusCode: 200, messageCode: "success", runs: { - totalCount: formattedRuns.length, - items: formattedRuns, + totalCount: formattedRuns.length, + items: formattedRuns, }, }; @@ -342,6 +342,8 @@ function formatRunResponse(run: any) { data: { textData: {}, listData: {}, + crawlData: {}, + searchData: {}, markdown: '', html: '' }, @@ -358,6 +360,14 @@ function formatRunResponse(run: any) { formattedRun.data.listData = output.scrapeList; } + if (output.crawl && typeof output.crawl === 'object') { + formattedRun.data.crawlData = output.crawl; + } + + if (output.search && typeof output.search === 'object') { + formattedRun.data.searchData = output.search; + } + if (output.markdown && Array.isArray(output.markdown)) { formattedRun.data.markdown = output.markdown[0]?.content || ''; } @@ -466,7 +476,7 @@ router.get("/robots/:id/runs/:runId", requireAPIKey, async (req: Request, res: R } }); -async function createWorkflowAndStoreMetadata(id: string, userId: string) { +async function createWorkflowAndStoreMetadata(id: string, userId: string, isSDK: boolean) { try { const recording = await Robot.findOne({ where: { @@ -510,7 +520,9 @@ async function createWorkflowAndStoreMetadata(id: string, userId: string) { interpreterSettings: { maxConcurrency: 1, maxRepeats: 1, debug: true }, log: '', runId, - runByAPI: true, + runByUserId: userId, + runByAPI: !isSDK, + runBySDK: isSDK, serializableOutput: {}, binaryOutput: {}, retryCount: 0 @@ -687,7 +699,6 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[ let formats = recording.recording_meta.formats || ['markdown']; - // Override if API request defines formats if (requestedFormats && Array.isArray(requestedFormats) && requestedFormats.length > 0) { formats = requestedFormats.filter((f): f is 'markdown' | 'html' | 'screenshot-visible' | 'screenshot-fullpage' => ['markdown', 'html', 'screenshot-visible', 'screenshot-fullpage'].includes(f) @@ -714,50 +725,70 @@ async function executeRun(id: string, userId: string, requestedFormats?: 
string[ const SCRAPE_TIMEOUT = 120000; if (formats.includes('markdown')) { - const markdownPromise = convertPageToMarkdown(url, currentPage); - const timeoutPromise = new Promise((_, reject) => { - setTimeout(() => reject(new Error(`Markdown conversion timed out after ${SCRAPE_TIMEOUT/1000}s`)), SCRAPE_TIMEOUT); - }); - markdown = await Promise.race([markdownPromise, timeoutPromise]); - serializableOutput.markdown = [{ content: markdown }]; + try { + const markdownPromise = convertPageToMarkdown(url, currentPage); + const timeoutPromise = new Promise((_, reject) => { + setTimeout(() => reject(new Error(`Markdown conversion timed out after ${SCRAPE_TIMEOUT / 1000}s`)), SCRAPE_TIMEOUT); + }); + markdown = await Promise.race([markdownPromise, timeoutPromise]); + if (markdown && markdown.trim().length > 0) { + serializableOutput.markdown = [{ content: markdown }]; + } + } catch (error: any) { + logger.log('warn', `Markdown conversion failed for API run ${plainRun.runId}: ${error.message}`); + } } if (formats.includes('html')) { - const htmlPromise = convertPageToHTML(url, currentPage); - const timeoutPromise = new Promise((_, reject) => { - setTimeout(() => reject(new Error(`HTML conversion timed out after ${SCRAPE_TIMEOUT/1000}s`)), SCRAPE_TIMEOUT); - }); - html = await Promise.race([htmlPromise, timeoutPromise]); - serializableOutput.html = [{ content: html }]; + try { + const htmlPromise = convertPageToHTML(url, currentPage); + const timeoutPromise = new Promise((_, reject) => { + setTimeout(() => reject(new Error(`HTML conversion timed out after ${SCRAPE_TIMEOUT / 1000}s`)), SCRAPE_TIMEOUT); + }); + html = await Promise.race([htmlPromise, timeoutPromise]); + if (html && html.trim().length > 0) { + serializableOutput.html = [{ content: html }]; + } + } catch (error: any) { + logger.log('warn', `HTML conversion failed for API run ${plainRun.runId}: ${error.message}`); + } } if (formats.includes("screenshot-visible")) { - const screenshotPromise = convertPageToScreenshot(url, currentPage, false); - const timeoutPromise = new Promise((_, reject) => { - setTimeout(() => reject(new Error(`Screenshot conversion timed out after ${SCRAPE_TIMEOUT/1000}s`)), SCRAPE_TIMEOUT); - }); - const screenshotBuffer = await Promise.race([screenshotPromise, timeoutPromise]); + try { + const screenshotPromise = convertPageToScreenshot(url, currentPage, false); + const timeoutPromise = new Promise((_, reject) => { + setTimeout(() => reject(new Error(`Screenshot conversion timed out after ${SCRAPE_TIMEOUT / 1000}s`)), SCRAPE_TIMEOUT); + }); + const screenshotBuffer = await Promise.race([screenshotPromise, timeoutPromise]); - if (!binaryOutput['screenshot-visible']) { - binaryOutput['screenshot-visible'] = { - data: screenshotBuffer.toString('base64'), - mimeType: 'image/png' - }; + if (screenshotBuffer && screenshotBuffer.length > 0) { + binaryOutput['screenshot-visible'] = { + data: screenshotBuffer.toString('base64'), + mimeType: 'image/png' + }; + } + } catch (error: any) { + logger.log('warn', `Screenshot-visible conversion failed for API run ${plainRun.runId}: ${error.message}`); } } if (formats.includes("screenshot-fullpage")) { - const screenshotPromise = convertPageToScreenshot(url, currentPage, true); - const timeoutPromise = new Promise((_, reject) => { - setTimeout(() => reject(new Error(`Screenshot conversion timed out after ${SCRAPE_TIMEOUT/1000}s`)), SCRAPE_TIMEOUT); - }); - const screenshotBuffer = await Promise.race([screenshotPromise, timeoutPromise]); + try { + const screenshotPromise = 
convertPageToScreenshot(url, currentPage, true); + const timeoutPromise = new Promise((_, reject) => { + setTimeout(() => reject(new Error(`Screenshot conversion timed out after ${SCRAPE_TIMEOUT / 1000}s`)), SCRAPE_TIMEOUT); + }); + const screenshotBuffer = await Promise.race([screenshotPromise, timeoutPromise]); - if (!binaryOutput['screenshot-fullpage']) { - binaryOutput['screenshot-fullpage'] = { - data: screenshotBuffer.toString('base64'), - mimeType: 'image/png' - }; + if (screenshotBuffer && screenshotBuffer.length > 0) { + binaryOutput['screenshot-fullpage'] = { + data: screenshotBuffer.toString('base64'), + mimeType: 'image/png' + }; + } + } catch (error: any) { + logger.log('warn', `Screenshot-fullpage conversion failed for API run ${plainRun.runId}: ${error.message}`); } } @@ -769,7 +800,6 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[ binaryOutput, }); - // Upload binary output (screenshots) to MinIO if present let uploadedBinaryOutput: Record = {}; if (Object.keys(binaryOutput).length > 0) { const binaryOutputService = new BinaryOutputService('maxun-run-screenshots'); @@ -779,7 +809,6 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[ logger.log('info', `Markdown robot execution completed for API run ${id}`); - // Push success socket event try { const completionData = { runId: plainRun.runId, @@ -800,7 +829,6 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[ ); } - // Build webhook payload const webhookPayload: any = { robot_id: plainRun.robotMetaId, run_id: plainRun.runId, @@ -814,8 +842,8 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[ }, }; - if (formats.includes('markdown')) webhookPayload.markdown = markdown; - if (formats.includes('html')) webhookPayload.html = html; + if (serializableOutput.markdown) webhookPayload.markdown = markdown; + if (serializableOutput.html) webhookPayload.html = html; if (uploadedBinaryOutput['screenshot-visible']) webhookPayload.screenshot_visible = uploadedBinaryOutput['screenshot-visible']; if (uploadedBinaryOutput['screenshot-fullpage']) webhookPayload.screenshot_fullpage = uploadedBinaryOutput['screenshot-fullpage']; @@ -834,9 +862,12 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[ capture("maxun-oss-run-created-api", { runId: plainRun.runId, - user_id: userId, + userId: userId, + robotId: recording.recording_meta.id, + robotType: "scrape", + source: "api", status: "success", - robot_type: "scrape", + createdAt: new Date().toISOString(), formats }); @@ -858,14 +889,14 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[ log: `${formats.join(', ')} conversion failed: ${error.message}`, }); - // Send failure socket event try { const failureData = { runId: plainRun.runId, robotMetaId: plainRun.robotMetaId, robotName: recording.recording_meta.name, status: 'failed', - finishedAt: new Date().toLocaleString() + finishedAt: new Date().toLocaleString(), + error: error.message }; serverIo @@ -895,11 +926,14 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[ logger.log('warn', `Failed to send webhook for failed API scrape run ${plainRun.runId}: ${webhookError.message}`); } - capture("maxun-oss-run-created-api", { + capture("maxun-oss-run-created", { runId: plainRun.runId, - user_id: userId, + userId: userId, + robotId: recording.recording_meta.id, + robotType: "scrape", + source: "api", status: "failed", - 
robot_type: "scrape", + createdAt: new Date().toISOString(), formats }); @@ -993,15 +1027,18 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[ const totalRowsExtracted = totalSchemaItemsExtracted + totalListItemsExtracted; - capture('maxun-oss-run-created-api',{ + capture('maxun-oss-run-created',{ runId: id, - created_at: new Date().toISOString(), + userId: userId, + robotId: recording.recording_meta.id, + robotType: recording.recording_meta.type || 'extract', + source: 'api', + createdAt: new Date().toISOString(), status: 'success', - totalRowsExtracted, - schemaItemsExtracted: totalSchemaItemsExtracted, - listItemsExtracted: totalListItemsExtracted, + totalSchemaItemsExtracted, + totalListItemsExtracted, extractedScreenshotsCount, - is_llm: (recording.recording_meta as any).isLLM, + totalRowsExtracted } ) @@ -1019,6 +1056,16 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[ typeof parsedOutput.scrapeSchema === "string" ? JSON.parse(parsedOutput.scrapeSchema) : parsedOutput.scrapeSchema || {}; + + const parsedCrawl = + typeof parsedOutput.crawl === "string" + ? JSON.parse(parsedOutput.crawl) + : parsedOutput.crawl || {}; + + const parsedSearch = + typeof parsedOutput.search === "string" + ? JSON.parse(parsedOutput.search) + : parsedOutput.search || {}; const webhookPayload = { robot_id: plainRun.robotMetaId, @@ -1030,6 +1077,8 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[ extracted_data: { captured_texts: parsedSchema || {}, captured_lists: parsedList || {}, + crawl_data: parsedCrawl || {}, + search_data: parsedSearch || {}, captured_texts_count: totalSchemaItemsExtracted, captured_lists_count: totalListItemsExtracted, screenshots_count: extractedScreenshotsCount @@ -1097,7 +1146,6 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[ const recording = await Robot.findOne({ where: { 'recording_meta.id': run.robotMetaId }, raw: true }); - // Trigger webhooks for run failure const failedWebhookPayload = { robot_id: run.robotMetaId, run_id: run.runId, @@ -1123,10 +1171,14 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[ logger.log('error', `Failed to send failure webhooks for run ${run.runId}: ${webhookError.message}`); } capture( - 'maxun-oss-run-created-api', + 'maxun-oss-run-created', { runId: id, - created_at: new Date().toISOString(), + userId: userId, + robotId: recording?.recording_meta?.id || run.robotMetaId, + robotType: recording?.recording_meta?.type || 'extract', + source: 'api', + createdAt: new Date().toISOString(), status: 'failed', is_llm: (recording?.recording_meta as any)?.isLLM, } @@ -1139,11 +1191,11 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[ } } -export async function handleRunRecording(id: string, userId: string, requestedFormats?: string[]) { +export async function handleRunRecording(id: string, userId: string, isSDK: boolean = false) { let socket: Socket | null = null; try { - const result = await createWorkflowAndStoreMetadata(id, userId); + const result = await createWorkflowAndStoreMetadata(id, userId, isSDK); const { browserId, runId: newRunId } = result; if (!browserId || !newRunId || !userId) { @@ -1167,6 +1219,10 @@ export async function handleRunRecording(id: string, userId: string, requestedFo cleanupSocketConnection(socket!, browserId, newRunId); }); + socket.on('error', (error: Error) => { + logger.error(`Socket error for API run ${newRunId}: 
${error.message}`); + }); + socket.on('disconnect', () => { cleanupSocketConnection(socket!, browserId, newRunId); }); @@ -1318,9 +1374,7 @@ router.post("/robots/:id/runs", requireAPIKey, async (req: AuthenticatedRequest, return res.status(401).json({ ok: false, error: 'Unauthorized' }); } - const requestedFormats = req.body.formats; - - const runId = await handleRunRecording(req.params.id, req.user.id, requestedFormats); + const runId = await handleRunRecording(req.params.id, req.user.id); if (!runId) { throw new Error('Run ID is undefined'); diff --git a/server/src/api/sdk.ts b/server/src/api/sdk.ts index 4eb64208..dabd309c 100644 --- a/server/src/api/sdk.ts +++ b/server/src/api/sdk.ts @@ -455,13 +455,35 @@ router.post("/sdk/robots/:id/execute", requireAPIKey, async (req: AuthenticatedR } } + let crawlData: any[] = []; + if (run.serializableOutput?.crawl) { + const crawl: any = run.serializableOutput.crawl; + + if (Array.isArray(crawl)) { + crawlData = crawl; + } + else if (typeof crawl === 'object') { + const crawlValues = Object.values(crawl); + if (crawlValues.length > 0 && Array.isArray(crawlValues[0])) { + crawlData = crawlValues[0] as any[]; + } + } + } + + let searchData: any = {}; + if (run.serializableOutput?.search) { + searchData = run.serializableOutput.search; + } + return res.status(200).json({ data: { runId: run.runId, status: run.status, data: { textData: run.serializableOutput?.scrapeSchema || {}, - listData: listData + listData: listData, + crawlData: crawlData, + searchData: searchData }, screenshots: Object.values(run.binaryOutput || {}) } @@ -645,6 +667,202 @@ router.post("/sdk/robots/:id/runs/:runId/abort", requireAPIKey, async (req: Auth } }); +/** + * Create a crawl robot programmatically + * POST /api/sdk/crawl + */ +router.post("/sdk/crawl", requireAPIKey, async (req: AuthenticatedRequest, res: Response) => { + try { + const user = req.user; + const { url, name, crawlConfig } = req.body; + + if (!url || !crawlConfig) { + return res.status(400).json({ + error: "URL and crawl configuration are required" + }); + } + + try { + new URL(url); + } catch (err) { + return res.status(400).json({ + error: "Invalid URL format" + }); + } + + if (typeof crawlConfig !== 'object') { + return res.status(400).json({ + error: "crawlConfig must be an object" + }); + } + + const robotName = name || `Crawl Robot - ${new URL(url).hostname}`; + const robotId = uuid(); + const metaId = uuid(); + + const robot = await Robot.create({ + id: robotId, + userId: user.id, + recording_meta: { + name: robotName, + id: metaId, + createdAt: new Date().toISOString(), + updatedAt: new Date().toISOString(), + pairs: 1, + params: [], + type: 'crawl', + url: url, + }, + recording: { + workflow: [ + { + where: { url }, + what: [ + { action: 'flag', args: ['generated'] }, + { + action: 'crawl', + args: [crawlConfig], + name: 'Crawl' + } + ] + }, + { + where: { url: 'about:blank' }, + what: [ + { + action: 'goto', + args: [url] + }, + { + action: 'waitForLoadState', + args: ['networkidle'] + } + ] + } + ] + } + }); + + logger.info(`[SDK] Crawl robot created: ${metaId} (db: ${robotId}) by user ${user.id}`); + + capture("maxun-oss-robot-created", { + userId: user.id.toString(), + robotId: metaId, + robotName: robotName, + url: url, + robotType: 'crawl', + crawlConfig: crawlConfig, + source: 'sdk' + }); + + return res.status(201).json({ + data: robot, + message: "Crawl robot created successfully" + }); + + } catch (error: any) { + logger.error("[SDK] Error creating crawl robot:", error); + return 
res.status(500).json({ + error: "Failed to create crawl robot", + message: error.message + }); + } +}); + +/** + * Create a search robot programmatically + * POST /api/sdk/search + */ +router.post("/sdk/search", requireAPIKey, async (req: AuthenticatedRequest, res: Response) => { + try { + const user = req.user; + const { name, searchConfig } = req.body; + + if (!searchConfig) { + return res.status(400).json({ + error: "Search configuration is required" + }); + } + + if (!searchConfig.query) { + return res.status(400).json({ + error: "searchConfig must include a query" + }); + } + + if (typeof searchConfig !== 'object') { + return res.status(400).json({ + error: "searchConfig must be an object" + }); + } + + if (searchConfig.mode && !['discover', 'scrape'].includes(searchConfig.mode)) { + return res.status(400).json({ + error: "searchConfig.mode must be either 'discover' or 'scrape'" + }); + } + + searchConfig.provider = 'duckduckgo'; + + const robotName = name || `Search Robot - ${searchConfig.query}`; + const robotId = uuid(); + const metaId = uuid(); + + const robot = await Robot.create({ + id: robotId, + userId: user.id, + recording_meta: { + name: robotName, + id: metaId, + createdAt: new Date().toISOString(), + updatedAt: new Date().toISOString(), + pairs: 1, + params: [], + type: 'search', + }, + recording: { + workflow: [ + { + where: { url: 'about:blank' }, + what: [ + { + action: 'search', + args: [searchConfig], + name: 'Search' + } + ] + } + ] + } + }); + + logger.info(`[SDK] Search robot created: ${metaId} (db: ${robotId}) by user ${user.id}`); + + capture("maxun-oss-robot-created", { + userId: user.id.toString(), + robotId: metaId, + robotName: robotName, + robotType: 'search', + searchQuery: searchConfig.query, + searchProvider: searchConfig.provider || 'duckduckgo', + searchLimit: searchConfig.limit || 10, + source: 'sdk' + }); + + return res.status(201).json({ + data: robot, + message: "Search robot created successfully" + }); + + } catch (error: any) { + logger.error("[SDK] Error creating search robot:", error); + return res.status(500).json({ + error: "Failed to create search robot", + message: error.message + }); + } +}); + /** * LLM-based extraction - generate workflow from natural language prompt * POST /api/sdk/extract/llm diff --git a/server/src/models/Robot.ts b/server/src/models/Robot.ts index 51f9e48f..d45ac502 100644 --- a/server/src/models/Robot.ts +++ b/server/src/models/Robot.ts @@ -9,7 +9,7 @@ interface RobotMeta { pairs: number; updatedAt: string; params: any[]; - type?: 'extract' | 'scrape'; + type?: 'extract' | 'scrape' | 'crawl' | 'search'; url?: string; formats?: ('markdown' | 'html' | 'screenshot-visible' | 'screenshot-fullpage')[]; isLLM?: boolean; diff --git a/server/src/models/Run.ts b/server/src/models/Run.ts index 6f560f48..0b25005d 100644 --- a/server/src/models/Run.ts +++ b/server/src/models/Run.ts @@ -23,6 +23,7 @@ interface RunAttributes { runByUserId?: string; runByScheduleId?: string; runByAPI?: boolean; + runBySDK?: boolean; serializableOutput: Record; binaryOutput: Record; retryCount?: number; diff --git a/server/src/pgboss-worker.ts b/server/src/pgboss-worker.ts index 43b4efc3..fb91a043 100644 --- a/server/src/pgboss-worker.ts +++ b/server/src/pgboss-worker.ts @@ -132,7 +132,6 @@ async function processRunExecution(job: Job) { logger.log('info', `Processing run execution job for runId: ${data.runId}, browserId: ${data.browserId}`); try { - // Find the run const run = await Run.findOne({ where: { runId: data.runId } }); if (!run) { 
logger.log('error', `Run ${data.runId} not found in database`); @@ -193,7 +192,6 @@ async function processRunExecution(job: Job) { logger.log('info', `Browser ${browserId} found and ready for execution`); try { - // Find the recording const recording = await Robot.findOne({ where: { 'recording_meta.id': plainRun.robotMetaId }, raw: true }); if (!recording) { @@ -473,11 +471,12 @@ async function processRunExecution(job: Job) { interpretationInfo.binaryOutput ); - // Get the already persisted and credit-validated data from the run record const finalRun = await Run.findByPk(run.id); const categorizedOutput = { scrapeSchema: finalRun?.serializableOutput?.scrapeSchema || {}, - scrapeList: finalRun?.serializableOutput?.scrapeList || {} + scrapeList: finalRun?.serializableOutput?.scrapeList || {}, + crawl: finalRun?.serializableOutput?.crawl || {}, + search: finalRun?.serializableOutput?.search || {} }; if (await isRunAborted()) { @@ -489,10 +488,6 @@ async function processRunExecution(job: Job) { status: 'success', finishedAt: new Date().toLocaleString(), log: interpretationInfo.log.join('\n'), - serializableOutput: JSON.parse(JSON.stringify({ - scrapeSchema: categorizedOutput.scrapeSchema || {}, - scrapeList: categorizedOutput.scrapeList || {}, - })), binaryOutput: uploadedBinaryOutput, }); @@ -572,6 +567,8 @@ async function processRunExecution(job: Job) { }, {} as Record) : {}, captured_lists: categorizedOutput.scrapeList, + crawl_data: categorizedOutput.crawl, + search_data: categorizedOutput.search, captured_texts_count: totalSchemaItemsExtracted, captured_lists_count: totalListItemsExtracted, screenshots_count: extractedScreenshotsCount diff --git a/server/src/routes/storage.ts b/server/src/routes/storage.ts index 896baf04..277c95d8 100644 --- a/server/src/routes/storage.ts +++ b/server/src/routes/storage.ts @@ -251,21 +251,18 @@ function handleWorkflowActions(workflow: any[], credentials: Credentials) { router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, res) => { try { const { id } = req.params; - const { name, limits, credentials, targetUrl, workflow: incomingWorkflow } = req.body; + const { name, limits, credentials, targetUrl, workflow: incomingWorkflow } = req.body; - // Validate input - if (!name && !limits && !credentials && !targetUrl) { + if (!name && !limits && !credentials && !targetUrl && !incomingWorkflow) { return res.status(400).json({ error: 'Either "name", "limits", "credentials" or "target_url" must be provided.' }); } - // Fetch the robot by ID const robot = await Robot.findOne({ where: { 'recording_meta.id': id } }); - if (!robot) { return res.status(404).json({ error: 'Robot not found.' 
}); } - // Update fields if provided + if (name) { robot.set('recording_meta', { ...robot.recording_meta, name }); } @@ -274,7 +271,6 @@ router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, r robot.set('recording_meta', { ...robot.recording_meta, url: targetUrl }); const updatedWorkflow = [...robot.recording.workflow]; - let foundGoto = false; for (let i = updatedWorkflow.length - 1; i >= 0; i--) { const step = updatedWorkflow[i]; @@ -289,7 +285,6 @@ router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, r robot.set('recording', { ...robot.recording, workflow: updatedWorkflow }); robot.changed('recording', true); - foundGoto = true; i = -1; break; } @@ -299,10 +294,9 @@ router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, r await robot.save(); - // Start with existing workflow or allow client to supply a full workflow replacement let workflow = incomingWorkflow && Array.isArray(incomingWorkflow) ? JSON.parse(JSON.stringify(incomingWorkflow)) - : [...robot.recording.workflow]; // Create a copy of the workflow + : [...robot.recording.workflow]; if (credentials) { workflow = handleWorkflowActions(workflow, credentials); @@ -344,7 +338,7 @@ router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, r where: { 'recording_meta.id': id } }); - const updatedRobot = await Robot.findOne({ where: { 'recording_meta.id': id } }); + await Robot.findOne({ where: { 'recording_meta.id': id } }); logger.log('info', `Robot with ID ${id} was updated successfully.`); @@ -1323,4 +1317,198 @@ export async function recoverOrphanedRuns() { } } +/** + * POST endpoint for creating a crawl robot + * @route POST /recordings/crawl + * @auth requireSignIn - JWT authentication required + */ +router.post('/recordings/crawl', requireSignIn, async (req: AuthenticatedRequest, res) => { + try { + const { url, name, crawlConfig } = req.body; + + if (!url || !crawlConfig) { + return res.status(400).json({ error: 'URL and crawl configuration are required.' 
}); + } + + if (!req.user) { + return res.status(401).send({ error: 'Unauthorized' }); + } + + try { + new URL(url); + } catch (err) { + return res.status(400).json({ error: 'Invalid URL format' }); + } + + const robotName = name || `Crawl Robot - ${new URL(url).hostname}`; + const currentTimestamp = new Date().toLocaleString('en-US'); + const robotId = uuid(); + + const newRobot = await Robot.create({ + id: uuid(), + userId: req.user.id, + recording_meta: { + name: robotName, + id: robotId, + createdAt: currentTimestamp, + updatedAt: currentTimestamp, + pairs: 1, + params: [], + type: 'crawl', + url: url, + }, + recording: { + workflow: [ + { + where: { url }, + what: [ + { action: 'flag', args: ['generated'] }, + { + action: 'crawl', + args: [crawlConfig], + name: 'Crawl' + } + ] + }, + { + where: { url: 'about:blank' }, + what: [ + { + action: 'goto', + args: [url] + }, + { + action: 'waitForLoadState', + args: ['networkidle'] + } + ] + } + ] + }, + google_sheet_email: null, + google_sheet_name: null, + google_sheet_id: null, + google_access_token: null, + google_refresh_token: null, + airtable_base_id: null, + airtable_base_name: null, + airtable_table_name: null, + airtable_table_id: null, + airtable_access_token: null, + airtable_refresh_token: null, + schedule: null, + webhooks: null + }); + + logger.log('info', `Crawl robot created with id: ${newRobot.id}`); + capture('maxun-oss-robot-created', { + userId: req.user.id.toString(), + robotId: robotId, + robotName: robotName, + url: url, + robotType: 'crawl', + crawlConfig: crawlConfig + }); + + return res.status(201).json({ + message: 'Crawl robot created successfully.', + robot: newRobot, + }); + } catch (error) { + if (error instanceof Error) { + logger.log('error', `Error creating crawl robot: ${error.message}`); + return res.status(500).json({ error: error.message }); + } else { + logger.log('error', 'Unknown error creating crawl robot'); + return res.status(500).json({ error: 'An unknown error occurred.' }); + } + } +}); + +/** + * POST endpoint for creating a search robot + * @route POST /recordings/search + * @auth requireSignIn - JWT authentication required + */ +router.post('/recordings/search', requireSignIn, async (req: AuthenticatedRequest, res) => { + try { + const { searchConfig, name } = req.body; + + if (!searchConfig || !searchConfig.query) { + return res.status(400).json({ error: 'Search configuration with query is required.' 
}); + } + + if (!req.user) { + return res.status(401).send({ error: 'Unauthorized' }); + } + + const robotName = name || `Search Robot - ${searchConfig.query.substring(0, 50)}`; + const currentTimestamp = new Date().toLocaleString('en-US'); + const robotId = uuid(); + + const newRobot = await Robot.create({ + id: uuid(), + userId: req.user.id, + recording_meta: { + name: robotName, + id: robotId, + createdAt: currentTimestamp, + updatedAt: currentTimestamp, + pairs: 1, + params: [], + type: 'search', + }, + recording: { + workflow: [ + { + where: { url: 'about:blank' }, + what: [{ + action: 'search', + args: [searchConfig], + name: 'Search' + }] + } + ] + }, + google_sheet_email: null, + google_sheet_name: null, + google_sheet_id: null, + google_access_token: null, + google_refresh_token: null, + airtable_base_id: null, + airtable_base_name: null, + airtable_table_name: null, + airtable_table_id: null, + airtable_access_token: null, + airtable_refresh_token: null, + schedule: null, + webhooks: null + }); + + logger.log('info', `Search robot created with id: ${newRobot.id}`); + capture('maxun-oss-robot-created', { + userId: req.user.id.toString(), + robotId: robotId, + robotName: robotName, + robotType: 'search', + searchQuery: searchConfig.query, + searchProvider: searchConfig.provider || 'duckduckgo', + searchLimit: searchConfig.limit || 10 + }); + + return res.status(201).json({ + message: 'Search robot created successfully.', + robot: newRobot, + }); + } catch (error) { + if (error instanceof Error) { + logger.log('error', `Error creating search robot: ${error.message}`); + return res.status(500).json({ error: error.message }); + } else { + logger.log('error', 'Unknown error creating search robot'); + return res.status(500).json({ error: 'An unknown error occurred.' 
}); + } + } +}); + export { processQueuedRuns }; \ No newline at end of file diff --git a/server/src/workflow-management/classes/Interpreter.ts b/server/src/workflow-management/classes/Interpreter.ts index 1bd6efe5..af4af624 100644 --- a/server/src/workflow-management/classes/Interpreter.ts +++ b/server/src/workflow-management/classes/Interpreter.ts @@ -16,7 +16,6 @@ function processWorkflow(workflow: WorkflowFile, checkLimit: boolean = false): W processedWorkflow.workflow.forEach((pair) => { pair.what.forEach((action) => { - // Handle limit validation for scrapeList action if (action.action === 'scrapeList' && checkLimit && Array.isArray(action.args) && action.args.length > 0) { const scrapeConfig = action.args[0]; if (scrapeConfig && typeof scrapeConfig === 'object' && 'limit' in scrapeConfig) { @@ -26,7 +25,6 @@ function processWorkflow(workflow: WorkflowFile, checkLimit: boolean = false): W } } - // Handle decryption for type and press actions if ((action.action === 'type' || action.action === 'press') && Array.isArray(action.args) && action.args.length > 1) { try { const encryptedValue = action.args[1]; @@ -93,10 +91,14 @@ export class WorkflowInterpreter { public serializableDataByType: { scrapeSchema: Record; scrapeList: Record; + crawl: Record; + search: Record; [key: string]: any; } = { scrapeSchema: {}, scrapeList: {}, + crawl: {}, + search: {}, }; private currentActionName: string | null = null; @@ -282,7 +284,6 @@ export class WorkflowInterpreter { } } else if (this.currentActionType === 'scrapeList') { if (data && Array.isArray(data) && data.length > 0) { - // Use the current index for persistence await this.persistDataToDatabase('scrapeList', data, this.currentScrapeListIndex); } @@ -293,7 +294,6 @@ export class WorkflowInterpreter { } }, binaryCallback: async (data: string, mimetype: string) => { - // For editor mode, we don't have the name yet, so use a timestamp-based name const binaryItem = { name: `Screenshot ${Date.now()}`, mimeType: mimetype, @@ -301,7 +301,6 @@ export class WorkflowInterpreter { }; this.binaryData.push(binaryItem); - // Persist binary data to database await this.persistBinaryDataToDatabase(binaryItem); this.socket.emit('binaryCallback', { @@ -340,7 +339,6 @@ export class WorkflowInterpreter { logger.log('debug', `Interpretation finished`); - // Flush any remaining data in persistence buffer before completing await this.flushPersistenceBuffer(); this.interpreter = null; @@ -419,6 +417,8 @@ export class WorkflowInterpreter { this.serializableDataByType = { scrapeSchema: {}, scrapeList: {}, + crawl: {}, + search: {}, }; this.binaryData = []; this.currentScrapeListIndex = 0; @@ -598,12 +598,20 @@ export class WorkflowInterpreter { typeKey = "scrapeList"; } else if (this.currentActionType === "scrapeSchema") { typeKey = "scrapeSchema"; + } else if (this.currentActionType === "crawl") { + typeKey = "crawl"; + } else if (this.currentActionType === "search") { + typeKey = "search"; } if (typeKey === "scrapeList" && data.scrapeList) { data = data.scrapeList; } else if (typeKey === "scrapeSchema" && data.scrapeSchema) { data = data.scrapeSchema; + } else if (typeKey === "crawl" && data.crawl) { + data = data.crawl; + } else if (typeKey === "search" && data.search) { + data = data.search; } let actionName = ""; @@ -616,38 +624,65 @@ export class WorkflowInterpreter { actionName = keys[keys.length - 1]; data = data[actionName]; } + } else if (typeKey === "crawl" && data && typeof data === "object" && !Array.isArray(data)) { + const keys = Object.keys(data); + if 
(keys.length === 1) { + actionName = keys[0]; + data = data[actionName]; + } else if (keys.length > 1) { + actionName = keys[keys.length - 1]; + data = data[actionName]; + } + } else if (typeKey === "search" && data && typeof data === "object" && !Array.isArray(data)) { + const keys = Object.keys(data); + if (keys.length === 1) { + actionName = keys[0]; + data = data[actionName]; + } else if (keys.length > 1) { + actionName = keys[keys.length - 1]; + data = data[actionName]; + } } if (!actionName) { actionName = this.currentActionName || ""; if (typeKey === "scrapeList" && !actionName) { actionName = this.getUniqueActionName(typeKey, ""); + } else if (typeKey === "crawl" && !actionName) { + actionName = this.getUniqueActionName(typeKey, "Crawl Results"); + } else if (typeKey === "search" && !actionName) { + actionName = this.getUniqueActionName(typeKey, "Search Results"); } } - const flattened = Array.isArray(data) - ? data - : ( - data?.List ?? - (data && typeof data === "object" - ? Object.values(data).flat?.() ?? data - : []) - ); + let processedData; + if (typeKey === "search") { + processedData = data; + } else { + processedData = Array.isArray(data) + ? data + : ( + data?.List ?? + (data && typeof data === "object" + ? Object.values(data).flat?.() ?? data + : []) + ); + } if (!this.serializableDataByType[typeKey]) { this.serializableDataByType[typeKey] = {}; } - this.serializableDataByType[typeKey][actionName] = flattened; + this.serializableDataByType[typeKey][actionName] = processedData; await this.persistDataToDatabase(typeKey, { - [actionName]: flattened, + [actionName]: processedData, }); this.socket.emit("serializableCallback", { type: typeKey, name: actionName, - data: flattened, + data: processedData, }); } catch (err: any) { logger.log('error', `serializableCallback handler failed: ${err.message}`); @@ -705,7 +740,6 @@ export class WorkflowInterpreter { await this.flushPersistenceBuffer(); - // Structure the output to maintain separate data for each action type const result = { log: this.debugMessages, result: status, @@ -801,7 +835,7 @@ export class WorkflowInterpreter { const currentSerializableOutput = run.serializableOutput ? 
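// Illustrative sketch (not part of the diff): the persistence shape chosen
// above. Search output keeps its structure (it carries the mode/results
// metadata the run view reads later), while the other action types are
// flattened to a plain row array. `toPersistedShape` is an invented name.
function toPersistedShape(typeKey: string, data: any): any {
  if (typeKey === "search") return data; // keep { mode, results, ... } intact
  if (Array.isArray(data)) return data;
  if (data && typeof data === "object") {
    return data.List ?? Object.values(data).flat(); // prefer an explicit List
  }
  return [];
}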
JSON.parse(JSON.stringify(run.serializableOutput)) : - { scrapeSchema: [], scrapeList: [] }; + { scrapeSchema: {}, scrapeList: {}, crawl: {}, search: {} }; if (Array.isArray(currentSerializableOutput.scrapeList)) { currentSerializableOutput.scrapeList = {}; @@ -809,6 +843,9 @@ export class WorkflowInterpreter { if (Array.isArray(currentSerializableOutput.scrapeSchema)) { currentSerializableOutput.scrapeSchema = {}; } + if (!currentSerializableOutput.search) { + currentSerializableOutput.search = {}; + } let hasUpdates = false; @@ -834,6 +871,18 @@ export class WorkflowInterpreter { } mergeLists(currentSerializableOutput.scrapeList, item.data); hasUpdates = true; + } else if (item.actionType === 'crawl') { + currentSerializableOutput.crawl = { + ...(currentSerializableOutput.crawl || {}), + ...item.data + }; + hasUpdates = true; + } else if (item.actionType === 'search') { + currentSerializableOutput.search = { + ...(currentSerializableOutput.search || {}), + ...item.data + }; + hasUpdates = true; } } diff --git a/server/src/workflow-management/integrations/airtable.ts b/server/src/workflow-management/integrations/airtable.ts index 788cb60b..a84d83db 100644 --- a/server/src/workflow-management/integrations/airtable.ts +++ b/server/src/workflow-management/integrations/airtable.ts @@ -13,7 +13,11 @@ interface AirtableUpdateTask { interface SerializableOutput { scrapeSchema?: Record; - scrapeList?: Record; + scrapeList?: Record; + markdown?: Array<{ content: string }>; + html?: Array<{ content: string }>; + crawl?: Record; + search?: any; } const MAX_RETRIES = 3; @@ -67,6 +71,10 @@ function mergeRelatedData(serializableOutput: SerializableOutput, binaryOutput: const schemaData: Array<{ Group: string; Field: string; Value: any }> = []; const listData: any[] = []; const screenshotData: Array<{ key: string; url: string }> = []; + const markdownData: any[] = []; + const htmlData: any[] = []; + const crawlData: any[] = []; + const searchData: any[] = []; if (serializableOutput.scrapeSchema) { if (Array.isArray(serializableOutput.scrapeSchema)) { @@ -122,6 +130,66 @@ function mergeRelatedData(serializableOutput: SerializableOutput, binaryOutput: } } + if (serializableOutput.markdown && Array.isArray(serializableOutput.markdown)) { + serializableOutput.markdown.forEach((item, index) => { + if (item.content) { + markdownData.push({ + "Index": index + 1, + "Type": "Markdown", + "Content": item.content + }); + } + }); + } + + if (serializableOutput.html && Array.isArray(serializableOutput.html)) { + serializableOutput.html.forEach((item, index) => { + if (item.content) { + htmlData.push({ + "Index": index + 1, + "Type": "HTML", + "Content": item.content + }); + } + }); + } + + if (serializableOutput.crawl && typeof serializableOutput.crawl === "object") { + for (const [crawlName, crawlArray] of Object.entries(serializableOutput.crawl)) { + if (Array.isArray(crawlArray)) { + crawlArray.forEach((crawlItem) => { + const hasContent = Object.values(crawlItem || {}).some( + (value) => value !== null && value !== undefined && value !== "" + ); + if (hasContent) { + crawlData.push({ "Crawl Type": crawlName, ...crawlItem }); + } + }); + } + } + } + + if (serializableOutput.search) { + let results: any[] = []; + + if (serializableOutput.search.results && Array.isArray(serializableOutput.search.results)) { + results = serializableOutput.search.results; + } else if (Array.isArray(serializableOutput.search)) { + results = serializableOutput.search; + } else { + results = [serializableOutput.search]; + } + + 
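// Illustrative sketch (not part of the diff): how the buffered crawl/search
// items above fold into the persisted run output. Later flushes for the same
// action name overwrite earlier ones; distinct action names accumulate side
// by side. The types are assumptions to keep the example self-contained.
interface BufferedItem {
  actionType: "crawl" | "search";
  data: Record<string, unknown>;
}

function foldIntoOutput(
  output: Record<string, Record<string, unknown>>,
  items: BufferedItem[]
): boolean {
  let hasUpdates = false;
  for (const item of items) {
    output[item.actionType] = { ...(output[item.actionType] || {}), ...item.data };
    hasUpdates = true;
  }
  return hasUpdates;
}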
results.forEach((result) => { + const hasContent = Object.values(result || {}).some( + (value) => value !== null && value !== undefined && value !== "" + ); + if (hasContent) { + searchData.push(result); + } + }); + } + // Collect screenshot data (handles both string and object forms safely) // if (binaryOutput && Object.keys(binaryOutput).length > 0) { // Object.entries(binaryOutput).forEach(([key, rawValue]: [string, any]) => { @@ -152,7 +220,15 @@ function mergeRelatedData(serializableOutput: SerializableOutput, binaryOutput: // } // --- Merge all types into Airtable rows --- - const maxLength = Math.max(schemaData.length, listData.length, screenshotData.length); + const maxLength = Math.max( + schemaData.length, + listData.length, + screenshotData.length, + markdownData.length, + htmlData.length, + crawlData.length, + searchData.length + ); for (let i = 0; i < maxLength; i++) { const record: Record = {}; @@ -176,6 +252,38 @@ function mergeRelatedData(serializableOutput: SerializableOutput, binaryOutput: record.Screenshot = screenshotData[i].url; } + if (i < markdownData.length) { + Object.entries(markdownData[i] || {}).forEach(([key, value]) => { + if (value !== null && value !== undefined && value !== "") { + record[key] = value; + } + }); + } + + if (i < htmlData.length) { + Object.entries(htmlData[i] || {}).forEach(([key, value]) => { + if (value !== null && value !== undefined && value !== "") { + record[key] = value; + } + }); + } + + if (i < crawlData.length) { + Object.entries(crawlData[i] || {}).forEach(([key, value]) => { + if (value !== null && value !== undefined && value !== "") { + record[key] = value; + } + }); + } + + if (i < searchData.length) { + Object.entries(searchData[i] || {}).forEach(([key, value]) => { + if (value !== null && value !== undefined && value !== "") { + record[key] = value; + } + }); + } + if (Object.keys(record).length > 0) { allRecords.push(record); } @@ -194,6 +302,18 @@ function mergeRelatedData(serializableOutput: SerializableOutput, binaryOutput: Screenshot: screenshotData[i].url, }); } + for (let i = maxLength; i < markdownData.length; i++) { + allRecords.push(markdownData[i]); + } + for (let i = maxLength; i < htmlData.length; i++) { + allRecords.push(htmlData[i]); + } + for (let i = maxLength; i < crawlData.length; i++) { + allRecords.push(crawlData[i]); + } + for (let i = maxLength; i < searchData.length; i++) { + allRecords.push(searchData[i]); + } return allRecords; } diff --git a/server/src/workflow-management/integrations/gsheet.ts b/server/src/workflow-management/integrations/gsheet.ts index b0871b75..c4ebb284 100644 --- a/server/src/workflow-management/integrations/gsheet.ts +++ b/server/src/workflow-management/integrations/gsheet.ts @@ -13,6 +13,10 @@ interface GoogleSheetUpdateTask { interface SerializableOutput { scrapeSchema?: Record; scrapeList?: Record; + markdown?: Array<{ content: string }>; + html?: Array<{ content: string }>; + crawl?: Record; + search?: any; } @@ -95,6 +99,72 @@ export async function updateGoogleSheet(robotId: string, runId: string) { } } + if (serializableOutput.markdown && Array.isArray(serializableOutput.markdown) && serializableOutput.markdown.length > 0) { + const markdownData = serializableOutput.markdown.map((item, index) => ({ + "Index": index + 1, + "Content": item.content || "" + })); + + await processOutputType( + robotId, + spreadsheetId, + 'Markdown', + markdownData, + plainRobot + ); + } + + if (serializableOutput.html && Array.isArray(serializableOutput.html) && 
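// Illustrative sketch (not part of the diff): the row-zipping strategy behind
// the Airtable merge above — every output kind (schema, list, screenshots,
// markdown, html, crawl, search) is aligned by index into one record per row,
// skipping empty values so shorter kinds simply stop contributing columns.
// `zipRecords` is an invented name.
function zipRecords(sources: Array<Array<Record<string, unknown>>>): Array<Record<string, unknown>> {
  const maxLength = Math.max(0, ...sources.map((rows) => rows.length));
  const records: Array<Record<string, unknown>> = [];
  for (let i = 0; i < maxLength; i++) {
    const record: Record<string, unknown> = {};
    for (const rows of sources) {
      if (i >= rows.length) continue;
      for (const [key, value] of Object.entries(rows[i] || {})) {
        if (value !== null && value !== undefined && value !== "") record[key] = value;
      }
    }
    if (Object.keys(record).length > 0) records.push(record);
  }
  return records;
}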
serializableOutput.html.length > 0) { + const htmlData = serializableOutput.html.map((item, index) => ({ + "Index": index + 1, + "Content": item.content || "" + })); + + await processOutputType( + robotId, + spreadsheetId, + 'HTML', + htmlData, + plainRobot + ); + } + + if (serializableOutput.crawl && typeof serializableOutput.crawl === "object") { + for (const [crawlName, crawlArray] of Object.entries(serializableOutput.crawl)) { + if (!Array.isArray(crawlArray) || crawlArray.length === 0) continue; + + await processOutputType( + robotId, + spreadsheetId, + `Crawl - ${crawlName}`, + crawlArray, + plainRobot + ); + } + } + + if (serializableOutput.search) { + let searchData: any[] = []; + + if (serializableOutput.search.results && Array.isArray(serializableOutput.search.results)) { + searchData = serializableOutput.search.results; + } else if (Array.isArray(serializableOutput.search)) { + searchData = serializableOutput.search; + } else { + searchData = [serializableOutput.search]; + } + + if (searchData.length > 0) { + await processOutputType( + robotId, + spreadsheetId, + 'Search Results', + searchData, + plainRobot + ); + } + } + } if (plainRun.binaryOutput && Object.keys(plainRun.binaryOutput).length > 0) { diff --git a/server/src/workflow-management/scheduler/index.ts b/server/src/workflow-management/scheduler/index.ts index 29999cff..8777ec7c 100644 --- a/server/src/workflow-management/scheduler/index.ts +++ b/server/src/workflow-management/scheduler/index.ts @@ -484,6 +484,8 @@ async function executeRun(id: string, userId: string) { const categorizedOutput = { scrapeSchema: finalRun?.serializableOutput?.scrapeSchema || {}, scrapeList: finalRun?.serializableOutput?.scrapeList || {}, + crawl: finalRun?.serializableOutput?.crawl || {}, + search: finalRun?.serializableOutput?.search || {} }; await destroyRemoteBrowser(plainRun.browserId, userId); @@ -570,6 +572,8 @@ async function executeRun(id: string, userId: string) { }, {} as Record) : {}, captured_lists: categorizedOutput.scrapeList, + crawl_data: categorizedOutput.crawl, + search_data: categorizedOutput.search, captured_texts_count: totalSchemaItemsExtracted, captured_lists_count: totalListItemsExtracted, screenshots_count: extractedScreenshotsCount diff --git a/src/api/storage.ts b/src/api/storage.ts index 4ac2f01b..75d53c48 100644 --- a/src/api/storage.ts +++ b/src/api/storage.ts @@ -335,4 +335,81 @@ export const deleteSchedule = async (id: string): Promise => { console.log(error); return false; } -} \ No newline at end of file +} + +export const createCrawlRobot = async ( + url: string, + name: string, + crawlConfig: { + mode: 'domain' | 'subdomain' | 'path'; + limit: number; + maxDepth: number; + includePaths: string[]; + excludePaths: string[]; + useSitemap: boolean; + followLinks: boolean; + respectRobots: boolean; + } +): Promise => { + try { + const response = await axios.post( + `${apiUrl}/storage/recordings/crawl`, + { + url, + name, + crawlConfig, + }, + { + headers: { 'Content-Type': 'application/json' }, + withCredentials: true, + } + ); + + if (response.status === 201) { + return response.data; + } else { + throw new Error('Failed to create crawl robot'); + } + } catch (error: any) { + console.error('Error creating crawl robot:', error); + return null; + } +}; + +export const createSearchRobot = async ( + name: string, + searchConfig: { + query: string; + limit: number; + provider: 'google' | 'bing' | 'duckduckgo'; + filters?: { + timeRange?: 'day' | 'week' | 'month' | 'year'; + location?: string; + lang?: string; + 
}; + mode: 'discover' | 'scrape'; + } +): Promise => { + try { + const response = await axios.post( + `${apiUrl}/storage/recordings/search`, + { + name, + searchConfig, + }, + { + headers: { 'Content-Type': 'application/json' }, + withCredentials: true, + } + ); + + if (response.status === 201) { + return response.data; + } else { + throw new Error('Failed to create search robot'); + } + } catch (error: any) { + console.error('Error creating search robot:', error); + return null; + } +}; \ No newline at end of file diff --git a/src/components/robot/pages/RobotConfigPage.tsx b/src/components/robot/pages/RobotConfigPage.tsx index 465bea14..bd3e606b 100644 --- a/src/components/robot/pages/RobotConfigPage.tsx +++ b/src/components/robot/pages/RobotConfigPage.tsx @@ -154,7 +154,7 @@ export const RobotConfigPage: React.FC = ({ )} - /* {showCancelButton && ( + {/* {showCancelButton && ( - )} */ + )} */} {showSaveButton && onSave && ( + + + + + + Crawl Scope + + + + setCrawlMaxDepth(parseInt(e.target.value) || 3)} + sx={{ mb: 2 }} + helperText="How many links deep to follow (default: 3)" + FormHelperTextProps={{ sx: { ml: 0 } }} + /> + + setCrawlIncludePaths(e.target.value)} + sx={{ mb: 2 }} + helperText="Only crawl URLs matching these paths (comma-separated)" + FormHelperTextProps={{ sx: { ml: 0 } }} + /> + + setCrawlExcludePaths(e.target.value)} + sx={{ mb: 2 }} + helperText="Skip URLs matching these paths (comma-separated)" + FormHelperTextProps={{ sx: { ml: 0 } }} + /> + + + setCrawlUseSitemap(e.target.checked)} + /> + } + label="Use sitemap.xml for URL discovery" + /> + setCrawlFollowLinks(e.target.checked)} + /> + } + label="Follow links on pages" + /> + setCrawlRespectRobots(e.target.checked)} + /> + } + label="Respect robots.txt" + /> + + + + + + + + + + + + + + Maxun Logo + + + Search the web and gather data from relevant results. 
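// Hypothetical call sites for the two new storage clients above
// (createCrawlRobot / createSearchRobot). The endpoints and config shapes come
// from the diff; the names, URLs, and values are invented for illustration.
import { createCrawlRobot, createSearchRobot } from "./api/storage";

const crawlRobot = await createCrawlRobot("https://example.com/docs", "Docs crawler", {
  mode: "path",                    // stay under /docs
  limit: 100,                      // stop after 100 discovered URLs
  maxDepth: 3,
  includePaths: ["/docs"],
  excludePaths: ["/docs/archive"],
  useSitemap: true,
  followLinks: true,
  respectRobots: true,
});

const searchRobot = await createSearchRobot("Keyboard watch", {
  query: "best mechanical keyboards",
  limit: 10,
  provider: "duckduckgo",          // the edit page also pins this on save
  filters: { timeRange: "month", lang: "en" },
  mode: "discover",                // metadata only; "scrape" also fetches each page
});

// Both clients resolve to null on failure instead of throwing.
if (!crawlRobot || !searchRobot) console.error("robot creation failed");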
+ + + + setSearchRobotName(e.target.value)} + sx={{ mb: 2 }} + /> + + setSearchQuery(e.target.value)} + sx={{ mb: 2 }} + /> + + setSearchLimit(parseInt(e.target.value) || 10)} + sx={{ mb: 2 }} + /> + + + + Mode + + + + + Time Range + + + + + + + + + diff --git a/src/components/robot/pages/RobotDuplicatePage.tsx b/src/components/robot/pages/RobotDuplicatePage.tsx index f021ee45..8607b41b 100644 --- a/src/components/robot/pages/RobotDuplicatePage.tsx +++ b/src/components/robot/pages/RobotDuplicatePage.tsx @@ -24,7 +24,7 @@ interface RobotMeta { pairs: number; updatedAt: string; params: any[]; - type?: 'extract' | 'scrape'; + type?: 'extract' | 'scrape' | 'crawl' | 'search'; url?: string; formats?: ('markdown' | 'html' | 'screenshot-visible' | 'screenshot-fullpage')[]; isLLM?: boolean; diff --git a/src/components/robot/pages/RobotEditPage.tsx b/src/components/robot/pages/RobotEditPage.tsx index 8914f4aa..310541d0 100644 --- a/src/components/robot/pages/RobotEditPage.tsx +++ b/src/components/robot/pages/RobotEditPage.tsx @@ -1,4 +1,4 @@ -import React, { useState, useEffect } from "react"; +import { useState, useEffect } from "react"; import { useTranslation } from "react-i18next"; import { TextField, @@ -7,7 +7,13 @@ import { Button, IconButton, InputAdornment, - Divider, + FormControl, + InputLabel, + Select, + MenuItem, + FormControlLabel, + Checkbox, + Collapse } from "@mui/material"; import { Visibility, VisibilityOff } from "@mui/icons-material"; import { useGlobalInfoStore } from "../../../context/globalInfo"; @@ -24,7 +30,7 @@ interface RobotMeta { pairs: number; updatedAt: string; params: any[]; - type?: 'extract' | 'scrape'; + type?: 'extract' | 'scrape' | 'crawl' | 'search'; url?: string; formats?: ('markdown' | 'html' | 'screenshot-visible' | 'screenshot-fullpage')[]; isLLM?: boolean; @@ -97,6 +103,25 @@ interface ScrapeListLimit { currentLimit: number; } +interface CrawlConfig { + mode?: string; + limit?: number; + maxDepth?: number; + useSitemap?: boolean; + followLinks?: boolean; + excludePaths?: string[]; + includePaths?: string[]; + respectRobots?: boolean; +} + +interface SearchConfig { + mode?: 'discover' | 'scrape'; + limit?: number; + query?: string; + filters?: Record; + provider?: string; +} + export const RobotEditPage = ({ handleStart }: RobotSettingsProps) => { const { t } = useTranslation(); const navigate = useNavigate(); @@ -115,6 +140,9 @@ export const RobotEditPage = ({ handleStart }: RobotSettingsProps) => { [] ); const [isLoading, setIsLoading] = useState(false); + const [crawlConfig, setCrawlConfig] = useState({}); + const [searchConfig, setSearchConfig] = useState({}); + const [showCrawlAdvanced, setShowCrawlAdvanced] = useState(false); const isEmailPattern = (value: string): boolean => { return value.includes("@"); @@ -163,6 +191,8 @@ export const RobotEditPage = ({ handleStart }: RobotSettingsProps) => { setCredentialGroups(groupCredentialsByType(extractedCredentials)); findScrapeListLimits(robot.recording.workflow); + extractCrawlConfig(robot.recording.workflow); + extractSearchConfig(robot.recording.workflow); } }, [robot]); @@ -195,6 +225,36 @@ export const RobotEditPage = ({ handleStart }: RobotSettingsProps) => { setScrapeListLimits(limits); }; + const extractCrawlConfig = (workflow: WhereWhatPair[]) => { + workflow.forEach((pair) => { + if (!pair.what) return; + + pair.what.forEach((action: any) => { + if (action.action === "crawl" && action.args && action.args.length > 0) { + const config = action.args[0]; + if (config && typeof config === "object") { 
+ setCrawlConfig(config as CrawlConfig); + } + } + }); + }); + }; + + const extractSearchConfig = (workflow: WhereWhatPair[]) => { + workflow.forEach((pair) => { + if (!pair.what) return; + + pair.what.forEach((action: any) => { + if (action.action === "search" && action.args && action.args.length > 0) { + const config = action.args[0]; + if (config && typeof config === "object") { + setSearchConfig(config as SearchConfig); + } + } + }); + }); + }; + function extractInitialCredentials(workflow: any[]): Credentials { const credentials: Credentials = {}; @@ -475,19 +535,17 @@ export const RobotEditPage = ({ handleStart }: RobotSettingsProps) => { <> {renderCredentialFields( credentialGroups.usernames, - t("Username"), - "text" + t("Username") )} - {renderCredentialFields(credentialGroups.emails, t("Email"), "text")} + {renderCredentialFields(credentialGroups.emails, t("Email"))} {renderCredentialFields( credentialGroups.passwords, - t("Password"), - "password" + t("Password") )} - {renderCredentialFields(credentialGroups.others, t("Other"), "text")} + {renderCredentialFields(credentialGroups.others, t("Other"))} ); }; @@ -502,7 +560,6 @@ export const RobotEditPage = ({ handleStart }: RobotSettingsProps) => { {scrapeListLimits.map((limitInfo, index) => { - // Get the corresponding scrapeList action to extract its name const scrapeListAction = robot?.recording?.workflow?.[limitInfo.pairIndex]?.what?.[limitInfo.actionIndex]; const actionName = scrapeListAction?.name || @@ -542,7 +599,6 @@ export const RobotEditPage = ({ handleStart }: RobotSettingsProps) => { const screenshotInputs: JSX.Element[] = []; const listInputs: JSX.Element[] = []; - let textCount = 0; let screenshotCount = 0; let listCount = 0; @@ -683,7 +739,6 @@ export const RobotEditPage = ({ handleStart }: RobotSettingsProps) => { const renderCredentialFields = ( selectors: string[], headerText: string, - defaultType: "text" | "password" = "text" ) => { if (selectors.length === 0) return null; @@ -737,6 +792,193 @@ export const RobotEditPage = ({ handleStart }: RobotSettingsProps) => { return url; }; + const renderCrawlConfigFields = () => { + if (robot?.recording_meta.type !== 'crawl') return null; + + return ( + <> + { + const value = parseInt(e.target.value, 10); + if (value >= 1) { + setCrawlConfig((prev) => ({ ...prev, limit: value })); + } + }} + inputProps={{ min: 1 }} + style={{ marginBottom: "20px" }} + /> + + + + + + + Crawl Scope + + + + { + const value = parseInt(e.target.value, 10); + if (value >= 1) { + setCrawlConfig((prev) => ({ ...prev, maxDepth: value })); + } + }} + inputProps={{ min: 1 }} + sx={{ mb: 2 }} + helperText="How many links deep to follow (default: 3)" + /> + + { + const paths = e.target.value ? e.target.value.split(',').map(p => p.trim()) : []; + setCrawlConfig((prev) => ({ ...prev, includePaths: paths })); + }} + sx={{ mb: 2 }} + helperText="Only crawl URLs matching these paths (comma-separated)" + /> + + { + const paths = e.target.value ? 
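// Illustrative sketch (not part of the diff): the workflow shape that
// extractCrawlConfig/extractSearchConfig above walk — each pair's `what` list
// holds actions, and crawl/search keep their whole config as args[0].
// The workflow literal is invented.
const exampleWorkflow: any[] = [
  {
    what: [
      { action: "goto", args: ["https://example.com"] },
      { action: "crawl", args: [{ mode: "domain", limit: 50, maxDepth: 3, useSitemap: true }] },
    ],
  },
];

for (const pair of exampleWorkflow) {
  for (const action of pair.what ?? []) {
    if (action.action === "crawl" && action.args?.length) {
      console.log("crawl config:", action.args[0]); // -> { mode: "domain", ... }
    }
  }
}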
e.target.value.split(',').map(p => p.trim()) : []; + setCrawlConfig((prev) => ({ ...prev, excludePaths: paths })); + }} + sx={{ mb: 2 }} + helperText="Skip URLs matching these paths (comma-separated)" + /> + + + setCrawlConfig((prev) => ({ ...prev, useSitemap: e.target.checked }))} + /> + } + label="Use sitemap.xml for URL discovery" + /> + setCrawlConfig((prev) => ({ ...prev, followLinks: e.target.checked }))} + /> + } + label="Follow links on pages" + /> + setCrawlConfig((prev) => ({ ...prev, respectRobots: e.target.checked }))} + /> + } + label="Respect robots.txt" + /> + + + + + ); + }; + + const renderSearchConfigFields = () => { + if (robot?.recording_meta.type !== 'search') return null; + + return ( + <> + { + setSearchConfig((prev) => ({ ...prev, query: e.target.value })); + }} + sx={{ mb: 2 }} + /> + + { + const value = parseInt(e.target.value, 10); + if (value >= 1) { + setSearchConfig((prev) => ({ ...prev, limit: value })); + } + }} + inputProps={{ min: 1 }} + sx={{ mb: 2 }} + /> + + + Mode + + + + + Time Range + + + + ); + }; + const handleSave = async () => { if (!robot) return; @@ -757,6 +999,48 @@ export const RobotEditPage = ({ handleStart }: RobotSettingsProps) => { const targetUrl = getTargetUrl(); + let updatedWorkflow = robot.recording.workflow; + if (robot.recording_meta.type === 'crawl') { + updatedWorkflow = updatedWorkflow.map((pair: any) => { + if (!pair.what) return pair; + + return { + ...pair, + what: pair.what.map((action: any) => { + if (action.action === 'crawl') { + return { + ...action, + args: [{ ...crawlConfig }] + }; + } + return action; + }) + }; + }); + } + + if (robot.recording_meta.type === 'search') { + updatedWorkflow = updatedWorkflow.map((pair: any) => { + if (!pair.what) return pair; + + return { + ...pair, + what: pair.what.map((action: any) => { + if (action.action === 'search') { + return { + ...action, + args: [{ + ...searchConfig, + provider: 'duckduckgo' + }] + }; + } + return action; + }) + }; + }); + } + const payload: any = { name: robot.recording_meta.name, limits: scrapeListLimits.map((limit) => ({ @@ -767,8 +1051,7 @@ export const RobotEditPage = ({ handleStart }: RobotSettingsProps) => { })), credentials: credentialsForPayload, targetUrl: targetUrl, - // send the (possibly edited) workflow so backend can persist action name changes - workflow: robot.recording.workflow, + workflow: updatedWorkflow, }; const success = await updateRecording(robot.recording_meta.id, payload); @@ -818,26 +1101,21 @@ export const RobotEditPage = ({ handleStart }: RobotSettingsProps) => { style={{ marginBottom: "20px" }} /> - handleTargetUrlChange(e.target.value)} - style={{ marginBottom: "20px" }} - /> - {renderScrapeListLimitFields() && ( - <> - - {renderScrapeListLimitFields()} - + {robot.recording_meta.type !== 'search' && ( + handleTargetUrlChange(e.target.value)} + style={{ marginBottom: "20px" }} + /> )} + + {renderCrawlConfigFields()} + {renderSearchConfigFields()} - {renderActionNameFields() && ( - <> - - {renderActionNameFields()} - - )} + {renderScrapeListLimitFields()} + {renderActionNameFields()} )} diff --git a/src/components/robot/pages/RobotSettingsPage.tsx b/src/components/robot/pages/RobotSettingsPage.tsx index a5618d4c..684107e8 100644 --- a/src/components/robot/pages/RobotSettingsPage.tsx +++ b/src/components/robot/pages/RobotSettingsPage.tsx @@ -1,7 +1,6 @@ -import React, { useState, useEffect } from "react"; +import { useState, useEffect } from "react"; import { useTranslation } from "react-i18next"; -import { TextField, Typography, 
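// Illustrative sketch (not part of the diff): the immutable rewrite handleSave
// below performs — every crawl action gets its args replaced with the edited
// config; all other actions and pairs pass through unchanged.
// `applyCrawlConfig` is an invented name; types are loosened to `any` as in the diff.
function applyCrawlConfig(workflow: any[], crawlConfig: Record<string, any>): any[] {
  return workflow.map((pair) => {
    if (!pair.what) return pair;
    return {
      ...pair,
      what: pair.what.map((action: any) =>
        action.action === "crawl" ? { ...action, args: [{ ...crawlConfig }] } : action
      ),
    };
  });
}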
Box, Card, CardContent } from "@mui/material"; -import { Settings, Info } from "@mui/icons-material"; +import { TextField, Box } from "@mui/material"; import { useGlobalInfoStore } from "../../../context/globalInfo"; import { getStoredRecording } from "../../../api/storage"; import { WhereWhatPair } from "maxun-core"; @@ -16,7 +15,7 @@ interface RobotMeta { pairs: number; updatedAt: string; params: any[]; - type?: 'extract' | 'scrape'; + type?: 'extract' | 'scrape' | 'crawl' | 'search'; url?: string; formats?: ('markdown' | 'html' | 'screenshot-visible' | 'screenshot-fullpage')[]; isLLM?: boolean; @@ -116,19 +115,11 @@ export const RobotSettingsPage = ({ handleStart }: RobotSettingsProps) => { fetchUserEmail(); }, [robot?.userId]); - const handleCancel = () => { - const basePath = location.pathname.includes("/prebuilt-robots") - ? "/prebuilt-robots" - : "/robots"; - navigate(basePath); - }; - const targetUrl = getTargetUrl(); return ( { {robot && ( <> - + {robot.recording_meta.type !== 'search' && ( + + )} { const { t } = useTranslation(); +export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRef, abortRunHandler, workflowProgress }: RunContentProps) => { + const { t } = useTranslation(); + const { darkMode } = useThemeMode(); const [tab, setTab] = React.useState('output'); const [markdownContent, setMarkdownContent] = useState(''); const [htmlContent, setHtmlContent] = useState(''); @@ -54,6 +56,15 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe const [listKeys, setListKeys] = useState([]); const [currentListIndex, setCurrentListIndex] = useState(0); + const [crawlData, setCrawlData] = useState([]); + const [crawlColumns, setCrawlColumns] = useState([]); + const [crawlKeys, setCrawlKeys] = useState([]); + const [currentCrawlIndex, setCurrentCrawlIndex] = useState(0); + + const [searchData, setSearchData] = useState([]); + const [searchMode, setSearchMode] = useState<'discover' | 'scrape'>('discover'); + const [currentSearchIndex, setCurrentSearchIndex] = useState(0); + const [screenshotKeys, setScreenshotKeys] = useState([]); const [screenshotKeyMap, setScreenshotKeyMap] = useState>({}); const [currentScreenshotIndex, setCurrentScreenshotIndex] = useState(0); @@ -106,6 +117,10 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe setListData([]); setListColumns([]); setListKeys([]); + setCrawlData([]); + setCrawlColumns([]); + setCrawlKeys([]); + setSearchData([]); setLegacyData([]); setLegacyColumns([]); setIsLegacyData(false); @@ -117,7 +132,7 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe const hasLegacySchema = row.serializableOutput.scrapeSchema && Array.isArray(row.serializableOutput.scrapeSchema); const hasLegacyList = row.serializableOutput.scrapeList && Array.isArray(row.serializableOutput.scrapeList); - const hasOldFormat = !row.serializableOutput.scrapeSchema && !row.serializableOutput.scrapeList && Object.keys(row.serializableOutput).length > 0; + const hasOldFormat = !row.serializableOutput.scrapeSchema && !row.serializableOutput.scrapeList && !row.serializableOutput.crawl && !row.serializableOutput.search && Object.keys(row.serializableOutput).length > 0; if (hasLegacySchema || hasLegacyList || hasOldFormat) { processLegacyData(row.serializableOutput); @@ -134,6 +149,14 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe if (row.serializableOutput.scrapeList) { processScrapeList(row.serializableOutput.scrapeList); } + 
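// Illustrative sketch (not part of the diff): the legacy-format test the run
// view above now applies — output that has data but none of the typed buckets
// (scrapeSchema, scrapeList, crawl, search) is routed to the legacy processor.
// `isLegacyOutput` is an invented name.
function isLegacyOutput(out: Record<string, unknown>): boolean {
  return (
    !out.scrapeSchema &&
    !out.scrapeList &&
    !out.crawl &&
    !out.search &&
    Object.keys(out).length > 0
  );
}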
+ if (row.serializableOutput.crawl) { + processCrawl(row.serializableOutput.crawl); + } + + if (row.serializableOutput.search) { + processSearch(row.serializableOutput.search); + } }, [row.serializableOutput, row.status]); useEffect(() => { @@ -152,7 +175,6 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe let normalizedScreenshotKeys: string[]; if (isLegacyPattern) { - // Legacy unnamed screenshots → Screenshot 1, Screenshot 2... normalizedScreenshotKeys = rawKeys.map((_, index) => `Screenshot ${index + 1}`); } else { normalizedScreenshotKeys = rawKeys.map((key, index) => { @@ -355,6 +377,76 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe setCurrentListIndex(0); }; + const processCrawl = (crawlDataInput: any) => { + const tablesList: any[][] = []; + const columnsList: string[][] = []; + const keys: string[] = []; + + if (typeof crawlDataInput === 'object') { + Object.keys(crawlDataInput).forEach(key => { + const tableData = crawlDataInput[key]; + + if (Array.isArray(tableData) && tableData.length > 0) { + const filteredData = tableData.filter(row => + row && typeof row === 'object' && Object.values(row).some(value => value !== undefined && value !== "") + ); + + if (filteredData.length > 0) { + tablesList.push(filteredData); + keys.push(key); + const tableColumns = new Set(); + filteredData.forEach(item => { + Object.keys(item).forEach(key => tableColumns.add(key)); + }); + columnsList.push(Array.from(tableColumns)); + } + } + }); + } + + setCrawlData(tablesList); + setCrawlColumns(columnsList); + const normalizedCrawlKeys = keys.map((key, index) => { + if (!key || key.toLowerCase().includes("crawl")) { + return `Crawl ${index + 1}`; + } + return key; + }); + + setCrawlKeys(normalizedCrawlKeys); + setCurrentCrawlIndex(0); + }; + + const processSearch = (searchDataInput: any) => { + if (typeof searchDataInput === 'object') { + const keys = Object.keys(searchDataInput); + + if (keys.length > 0) { + const searchKey = keys[0]; + const searchInfo = searchDataInput[searchKey]; + + if (searchInfo && searchInfo.results && Array.isArray(searchInfo.results)) { + const mode = searchInfo.mode || 'discover'; + setSearchMode(mode); + + if (mode === 'scrape') { + setSearchData(searchInfo.results); + } else { + const normalizedResults = searchInfo.results.map((result: any, index: number) => ({ + title: result.title || '-', + url: result.url || '-', + description: result.description || '-', + position: result.position || index + 1, + })); + setSearchData(normalizedResults); + } + + setCurrentSearchIndex(0); + } + } + } + }; + const convertToCSV = (data: any[], columns: string[], isSchemaData: boolean = false, isTabular: boolean = false): string => { if (isSchemaData && !isTabular && data.length === 1) { const header = 'Label,Value'; @@ -375,7 +467,6 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe } }; - // Function to download a specific dataset as CSV const downloadCSV = (data: any[], columns: string[], filename: string, isSchemaData: boolean = false, isTabular: boolean = false) => { const csvContent = convertToCSV(data, columns, isSchemaData, isTabular); const blob = new Blob([csvContent], { type: 'text/csv;charset=utf-8;' }); @@ -426,6 +517,33 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe }, 100); }; + const downloadAllCrawlsAsZip = async (crawlDataArray: any[], zipFilename: string) => { + const zip = new JSZip(); + + crawlDataArray.forEach((item, index) => { + 
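// Illustrative sketch (not part of the diff): how processCrawl above derives
// its table shape — rows containing only empty values are dropped, then the
// column set is the union of keys across the surviving rows, so sparse crawl
// records still render under one shared header. `deriveTable` is an invented name.
function deriveTable(rows: Array<Record<string, unknown>>) {
  const filtered = rows.filter(
    (row) => row && typeof row === "object" && Object.values(row).some((v) => v !== undefined && v !== "")
  );
  const columns = new Set<string>();
  for (const row of filtered) Object.keys(row).forEach((k) => columns.add(k));
  return { rows: filtered, columns: Array.from(columns) };
}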
const url = item?.metadata?.url || item?.url || ''; + const filename = url + ? url.replace(/^https?:\/\//, '').replace(/\//g, '_').replace(/[^a-zA-Z0-9_.-]/g, '_') + '.json' + : `crawl_url_${index + 1}.json`; + + const jsonContent = JSON.stringify(item, null, 2); + zip.file(filename, jsonContent); + }); + + const blob = await zip.generateAsync({ type: 'blob' }); + const url = URL.createObjectURL(blob); + + const link = document.createElement("a"); + link.href = url; + link.setAttribute("download", zipFilename); + document.body.appendChild(link); + link.click(); + document.body.removeChild(link); + + setTimeout(() => { + URL.revokeObjectURL(url); + }, 100); + }; const renderDataTable = ( data: any[], @@ -433,14 +551,13 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe title: string, csvFilename: string, jsonFilename: string, - isPaginatedList: boolean = false, isSchemaData: boolean = false ) => { if (data.length === 0) return null; const shouldShowAsKeyValue = isSchemaData && !isSchemaTabular && data.length === 1; - if (title === '') { + if (!title || title.trim() === '') { return ( <> @@ -686,7 +803,7 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe ); }; - const hasData = schemaData.length > 0 || listData.length > 0 || legacyData.length > 0; + const hasData = schemaData.length > 0 || listData.length > 0 || crawlData.length > 0 || searchData.length > 0 || legacyData.length > 0; const hasScreenshots = row.binaryOutput && Object.keys(row.binaryOutput).length > 0; const hasMarkdown = markdownContent.length > 0; const hasHTML = htmlContent.length > 0; @@ -818,7 +935,6 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe )} ) : ( - // Extract robot output <> {row.status === 'running' || row.status === 'queued' ? ( <> @@ -910,7 +1026,6 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe '', `${schemaKeys[currentSchemaIndex] || 'schema_data'}.csv`, `${schemaKeys[currentSchemaIndex] || 'schema_data'}.json`, - false, true )} @@ -1059,6 +1174,588 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe )} )} + + {crawlData.length > 0 && crawlData[0] && crawlData[0].length > 0 && ( + + }> + + + Crawl Results + + + + + + {crawlData[0].map((item: any, idx: number) => { + const url = item?.metadata?.url || item?.url || `URL ${idx + 1}`; + + return ( + setCurrentCrawlIndex(idx)} + sx={{ + px: 2, + py: 1, + cursor: 'pointer', + backgroundColor: currentCrawlIndex === idx + ? darkMode ? '#121111ff' : '#e9ecef' + : 'transparent', + borderBottom: currentCrawlIndex === idx ? '3px solid #FF00C3' : 'none', + color: darkMode ? '#fff' : '#000', + whiteSpace: 'nowrap', + fontSize: '0.875rem', + flexShrink: 0, + }} + title={url} + > + Link {idx + 1} + + ); + })} + + + {crawlData[0][currentCrawlIndex] && ( + <> + + }> + + + Metadata + + + + + + + + {crawlData[0][currentCrawlIndex].metadata && + Object.entries(crawlData[0][currentCrawlIndex].metadata).map(([key, value]: [string, any]) => ( + + + {key} + + + {value === undefined || value === '' ? '-' : String(value)} + + + )) + } + +
+                                    </TableBody>
+                                  </Table>
+                                </TableContainer>
+ + {crawlData[0][currentCrawlIndex].text && ( + + }> + + + Text Content + + + + + + + {crawlData[0][currentCrawlIndex].text} + + + + + )} + + {crawlData[0][currentCrawlIndex].html && ( + + }> + + + HTML + + + + + + + {crawlData[0][currentCrawlIndex].html} + + + + + )} + + {crawlData[0][currentCrawlIndex].links && crawlData[0][currentCrawlIndex].links.length > 0 && ( + + }> + + + Links ({crawlData[0][currentCrawlIndex].links.length}) + + + + + + {crawlData[0][currentCrawlIndex].links.map((link: string, idx: number) => ( + + {link} + + ))} + + + + )} + + + + + + + + )} +
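// Illustrative sketch (not part of the diff): the filename derivation that
// downloadAllCrawlsAsZip above applies to each crawled page before zipping —
// scheme stripped, slashes and unsafe characters mapped to "_", with a
// positional fallback when no URL is present. `crawlFilename` is an invented name.
function crawlFilename(url: string, index: number): string {
  return url
    ? url.replace(/^https?:\/\//, "").replace(/\//g, "_").replace(/[^a-zA-Z0-9_.-]/g, "_") + ".json"
    : `crawl_url_${index + 1}.json`;
}

console.log(crawlFilename("https://example.com/docs/getting-started?x=1", 0));
// -> "example.com_docs_getting-started_x_1.json"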
+
+ )} + + {searchData.length > 0 && ( + + }> + + + Search Results + + + + + {searchMode === 'scrape' && searchData.length > 0 ? ( + <> + + {searchData.map((item: any, idx: number) => { + const url = item?.metadata?.url || item?.url || `Result ${idx + 1}`; + + return ( + setCurrentSearchIndex(idx)} + sx={{ + px: 2, + py: 1, + cursor: 'pointer', + backgroundColor: currentSearchIndex === idx + ? darkMode ? '#121111ff' : '#e9ecef' + : 'transparent', + borderBottom: currentSearchIndex === idx ? '3px solid #FF00C3' : 'none', + color: darkMode ? '#fff' : '#000', + whiteSpace: 'nowrap', + fontSize: '0.875rem', + flexShrink: 0, + }} + title={url} + > + Link {idx + 1} + + ); + })} + + + {searchData[currentSearchIndex] && ( + <> + + }> + + + Metadata + + + + + + + + {searchData[currentSearchIndex].metadata && + Object.entries(searchData[currentSearchIndex].metadata).map(([key, value]: [string, any]) => ( + + + {key} + + + {value === undefined || value === '' ? '-' : String(value)} + + + )) + } + +
+                                    </TableBody>
+                                  </Table>
+                                </TableContainer>
+ + {searchData[currentSearchIndex].text && ( + + }> + + + Text Content + + + + + + + {searchData[currentSearchIndex].text} + + + + + )} + + {searchData[currentSearchIndex].html && ( + + }> + + + HTML + + + + + + + {searchData[currentSearchIndex].html} + + + + + )} + + {searchData[currentSearchIndex].markdown && ( + + }> + + + Markdown + + + + + + + {searchData[currentSearchIndex].markdown} + + + + + )} + + {searchData[currentSearchIndex].links && searchData[currentSearchIndex].links.length > 0 && ( + + }> + + + Links ({searchData[currentSearchIndex].links.length}) + + + + + + {searchData[currentSearchIndex].links.map((link: string, idx: number) => ( + + {link} + + ))} + + + + )} + + + + + + )} + + ) : ( + <> + + + + + + Title + + + URL + + + Description + + + + + + {searchData.map((result: any, idx: number) => ( + + + {result.title || '-'} + + + {result.url ? ( + + {result.url} + + ) : '-'} + + + {result.description || '-'} + + + ))} + +
+
+ + + + + + )} +
+
+ )}
          )}
+
+        )}
diff --git a/src/components/run/RunsTable.tsx b/src/components/run/RunsTable.tsx
index 9af72d14..0661cf8d 100644
--- a/src/components/run/RunsTable.tsx
+++ b/src/components/run/RunsTable.tsx
@@ -56,6 +56,7 @@ export interface Data {
   runByScheduleId?: string;
   browserId: string;
   runByAPI?: boolean;
+  runBySDK?: boolean;
   log: string;
   runId: string;
   robotId: string;
diff --git a/src/context/globalInfo.tsx b/src/context/globalInfo.tsx
index 5db9239d..5f958d75 100644
--- a/src/context/globalInfo.tsx
+++ b/src/context/globalInfo.tsx
@@ -27,7 +27,7 @@ interface RobotMeta {
   pairs: number;
   updatedAt: string;
   params: any[];
-  type?: 'extract' | 'scrape';
+  type?: 'extract' | 'scrape' | 'crawl' | 'search';
   url?: string;
   formats?: ('markdown' | 'html' | 'screenshot-visible' | 'screenshot-fullpage')[];
   isLLM?: boolean;