Merge pull request #938 from getmaxun/crawl-search
feat: add crawl and search
This commit is contained in:
@@ -80,7 +80,9 @@ export default class Interpreter extends EventEmitter {
|
||||
|
||||
private serializableDataByType: Record<string, Record<string, any>> = {
|
||||
scrapeList: {},
|
||||
scrapeSchema: {}
|
||||
scrapeSchema: {},
|
||||
crawl: {},
|
||||
search: {}
|
||||
};
|
||||
|
||||
private scrapeListCounter: number = 0;
|
||||
@@ -570,7 +572,9 @@ export default class Interpreter extends EventEmitter {
|
||||
|
||||
await this.options.serializableCallback({
|
||||
scrapeList: this.serializableDataByType.scrapeList,
|
||||
scrapeSchema: this.serializableDataByType.scrapeSchema
|
||||
scrapeSchema: this.serializableDataByType.scrapeSchema,
|
||||
crawl: this.serializableDataByType.crawl || {},
|
||||
search: this.serializableDataByType.search || {}
|
||||
});
|
||||
},
|
||||
|
||||
@@ -708,6 +712,750 @@ export default class Interpreter extends EventEmitter {
|
||||
}
|
||||
},
|
||||
|
||||
crawl: async (crawlConfig: {
|
||||
mode: 'domain' | 'subdomain' | 'path';
|
||||
limit: number;
|
||||
maxDepth: number;
|
||||
includePaths: string[];
|
||||
excludePaths: string[];
|
||||
useSitemap: boolean;
|
||||
followLinks: boolean;
|
||||
respectRobots: boolean;
|
||||
}) => {
|
||||
if (this.isAborted) {
|
||||
this.log('Workflow aborted, stopping crawl', Level.WARN);
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
if (this.options.debugChannel?.setActionType) {
|
||||
this.options.debugChannel.setActionType('crawl');
|
||||
}
|
||||
|
||||
this.log('Starting crawl operation', Level.LOG);
|
||||
|
||||
try {
|
||||
const currentUrl = page.url();
|
||||
this.log(`Current page URL: ${currentUrl}`, Level.LOG);
|
||||
|
||||
if (!currentUrl || currentUrl === 'about:blank' || currentUrl === '') {
|
||||
this.log('Page not yet navigated, waiting for navigation...', Level.WARN);
|
||||
await page.waitForLoadState('load', { timeout: 10000 }).catch(() => {});
|
||||
}
|
||||
|
||||
const baseUrl = page.url();
|
||||
this.log(`Using base URL for crawl: ${baseUrl}`, Level.LOG);
|
||||
|
||||
const parsedBase = new URL(baseUrl);
|
||||
const baseDomain = parsedBase.hostname;
|
||||
|
||||
let discoveredUrls: string[] = [];
|
||||
|
||||
if (crawlConfig.useSitemap) {
|
||||
this.log('Fetching sitemap URLs...', Level.LOG);
|
||||
try {
|
||||
const sitemapUrl = `${parsedBase.protocol}//${parsedBase.host}/sitemap.xml`;
|
||||
|
||||
const sitemapUrls = await page.evaluate((url) => {
|
||||
return new Promise<string[]>((resolve) => {
|
||||
const xhr = new XMLHttpRequest();
|
||||
xhr.open('GET', url, true);
|
||||
xhr.onload = function() {
|
||||
if (xhr.status === 200) {
|
||||
const text = xhr.responseText;
|
||||
const locMatches = text.match(/<loc>(.*?)<\/loc>/g) || [];
|
||||
const urls = locMatches.map(match => match.replace(/<\/?loc>/g, ''));
|
||||
resolve(urls);
|
||||
} else {
|
||||
resolve([]);
|
||||
}
|
||||
};
|
||||
xhr.onerror = function() {
|
||||
resolve([]);
|
||||
};
|
||||
xhr.send();
|
||||
});
|
||||
}, sitemapUrl);
|
||||
|
||||
if (sitemapUrls.length > 0) {
|
||||
const nestedSitemaps = sitemapUrls.filter(url =>
|
||||
url.endsWith('/sitemap') || url.endsWith('sitemap.xml') || url.includes('/sitemap/')
|
||||
);
|
||||
const regularUrls = sitemapUrls.filter(url =>
|
||||
!url.endsWith('/sitemap') && !url.endsWith('sitemap.xml') && !url.includes('/sitemap/')
|
||||
);
|
||||
|
||||
discoveredUrls.push(...regularUrls);
|
||||
this.log(`Found ${regularUrls.length} regular URLs from main sitemap`, Level.LOG);
|
||||
|
||||
for (const nestedUrl of nestedSitemaps.slice(0, 10)) {
|
||||
try {
|
||||
this.log(`Fetching nested sitemap: ${nestedUrl}`, Level.LOG);
|
||||
const nestedUrls = await page.evaluate((url) => {
|
||||
return new Promise<string[]>((resolve) => {
|
||||
const xhr = new XMLHttpRequest();
|
||||
xhr.open('GET', url, true);
|
||||
xhr.onload = function() {
|
||||
if (xhr.status === 200) {
|
||||
const text = xhr.responseText;
|
||||
const locMatches = text.match(/<loc>(.*?)<\/loc>/g) || [];
|
||||
const urls = locMatches.map(match => match.replace(/<\/?loc>/g, ''));
|
||||
resolve(urls);
|
||||
} else {
|
||||
resolve([]);
|
||||
}
|
||||
};
|
||||
xhr.onerror = function() {
|
||||
resolve([]);
|
||||
};
|
||||
xhr.send();
|
||||
});
|
||||
}, nestedUrl);
|
||||
|
||||
if (nestedUrls.length > 0) {
|
||||
discoveredUrls.push(...nestedUrls);
|
||||
this.log(`Found ${nestedUrls.length} URLs from nested sitemap ${nestedUrl}`, Level.LOG);
|
||||
}
|
||||
} catch (error) {
|
||||
this.log(`Failed to fetch nested sitemap ${nestedUrl}: ${error.message}`, Level.WARN);
|
||||
}
|
||||
}
|
||||
|
||||
this.log(`Total URLs from all sitemaps: ${discoveredUrls.length}`, Level.LOG);
|
||||
} else {
|
||||
this.log('No URLs found in sitemap or sitemap not available', Level.WARN);
|
||||
}
|
||||
} catch (error) {
|
||||
this.log(`Sitemap fetch failed: ${error.message}`, Level.WARN);
|
||||
}
|
||||
}
|
||||
|
||||
if (crawlConfig.followLinks) {
|
||||
this.log('Extracting links from current page...', Level.LOG);
|
||||
try {
|
||||
await page.waitForLoadState('load', { timeout: 15000 }).catch(() => {});
|
||||
|
||||
await page.waitForLoadState('networkidle', { timeout: 10000 }).catch(() => {
|
||||
this.log('Network did not become idle, continuing anyway', Level.WARN);
|
||||
});
|
||||
|
||||
await new Promise(resolve => setTimeout(resolve, 5000));
|
||||
|
||||
const anchorCount = await page.evaluate(() => {
|
||||
return document.querySelectorAll('a').length;
|
||||
});
|
||||
this.log(`Page has ${anchorCount} total anchor tags`, Level.LOG);
|
||||
|
||||
const pageLinks = await page.evaluate(() => {
|
||||
const links: string[] = [];
|
||||
const allAnchors = document.querySelectorAll('a');
|
||||
console.log('Total anchors found:', allAnchors.length);
|
||||
|
||||
for (let i = 0; i < allAnchors.length; i++) {
|
||||
const anchor = allAnchors[i] as HTMLAnchorElement;
|
||||
const href = anchor.getAttribute('href');
|
||||
const fullHref = anchor.href;
|
||||
|
||||
if (fullHref && (fullHref.startsWith('http://') || fullHref.startsWith('https://'))) {
|
||||
links.push(fullHref);
|
||||
}
|
||||
}
|
||||
|
||||
console.log('Links extracted:', links.length);
|
||||
return links;
|
||||
});
|
||||
|
||||
discoveredUrls.push(...pageLinks);
|
||||
this.log(`Found ${pageLinks.length} links from page`, Level.LOG);
|
||||
} catch (error) {
|
||||
this.log(`Link extraction failed: ${error.message}`, Level.WARN);
|
||||
}
|
||||
}
|
||||
|
||||
const filteredUrls = discoveredUrls.filter(url => {
|
||||
try {
|
||||
const urlObj = new URL(url);
|
||||
|
||||
if (crawlConfig.mode === 'domain') {
|
||||
if (urlObj.hostname !== baseDomain) return false;
|
||||
} else if (crawlConfig.mode === 'subdomain') {
|
||||
if (!urlObj.hostname.endsWith(baseDomain) && urlObj.hostname !== baseDomain) return false;
|
||||
} else if (crawlConfig.mode === 'path') {
|
||||
if (urlObj.hostname !== baseDomain || !urlObj.pathname.startsWith(parsedBase.pathname)) return false;
|
||||
}
|
||||
|
||||
if (crawlConfig.includePaths && crawlConfig.includePaths.length > 0) {
|
||||
const matches = crawlConfig.includePaths.some(pattern => {
|
||||
const regex = new RegExp(pattern);
|
||||
return regex.test(url);
|
||||
});
|
||||
if (!matches) return false;
|
||||
}
|
||||
|
||||
if (crawlConfig.excludePaths && crawlConfig.excludePaths.length > 0) {
|
||||
const matches = crawlConfig.excludePaths.some(pattern => {
|
||||
const regex = new RegExp(pattern);
|
||||
return regex.test(url);
|
||||
});
|
||||
if (matches) return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
} catch (error) {
|
||||
return false;
|
||||
}
|
||||
});
|
||||
|
||||
const uniqueUrls = Array.from(new Set(filteredUrls.map(url => {
|
||||
return url.replace(/#.*$/, '').replace(/\/$/, '');
|
||||
})));
|
||||
|
||||
const basePathname = parsedBase.pathname;
|
||||
const prioritizedUrls = uniqueUrls.sort((a, b) => {
|
||||
try {
|
||||
const aUrl = new URL(a);
|
||||
const bUrl = new URL(b);
|
||||
const aMatchesBase = aUrl.pathname.startsWith(basePathname);
|
||||
const bMatchesBase = bUrl.pathname.startsWith(basePathname);
|
||||
|
||||
if (aMatchesBase && !bMatchesBase) return -1;
|
||||
if (!aMatchesBase && bMatchesBase) return 1;
|
||||
|
||||
return 0;
|
||||
} catch (error) {
|
||||
return 0;
|
||||
}
|
||||
});
|
||||
|
||||
const finalUrls = prioritizedUrls.slice(0, crawlConfig.limit);
|
||||
|
||||
this.log(`Crawl discovered ${finalUrls.length} URLs (from ${discoveredUrls.length} total)`, Level.LOG);
|
||||
|
||||
this.log(`Starting to scrape content from ${finalUrls.length} discovered URLs...`, Level.LOG);
|
||||
const crawlResults = [];
|
||||
|
||||
for (let i = 0; i < finalUrls.length; i++) {
|
||||
const url = finalUrls[i];
|
||||
try {
|
||||
this.log(`[${i + 1}/${finalUrls.length}] Scraping: ${url}`, Level.LOG);
|
||||
|
||||
await page.goto(url, {
|
||||
waitUntil: 'domcontentloaded',
|
||||
timeout: 30000
|
||||
}).catch(() => {
|
||||
this.log(`Failed to navigate to ${url}, skipping...`, Level.WARN);
|
||||
});
|
||||
|
||||
await page.waitForLoadState('load', { timeout: 10000 }).catch(() => {});
|
||||
|
||||
const pageData = await page.evaluate(() => {
|
||||
const getMeta = (name: string) => {
|
||||
const meta = document.querySelector(`meta[name="${name}"], meta[property="${name}"]`);
|
||||
return meta?.getAttribute('content') || '';
|
||||
};
|
||||
|
||||
const getAllMeta = () => {
|
||||
const metadata: Record<string, string> = {};
|
||||
const metaTags = document.querySelectorAll('meta');
|
||||
metaTags.forEach(tag => {
|
||||
const name = tag.getAttribute('name') || tag.getAttribute('property');
|
||||
const content = tag.getAttribute('content');
|
||||
if (name && content) {
|
||||
metadata[name] = content;
|
||||
}
|
||||
});
|
||||
return metadata;
|
||||
};
|
||||
|
||||
const title = document.title || '';
|
||||
const bodyText = document.body?.innerText || '';
|
||||
|
||||
const elementsWithMxId = document.querySelectorAll('[data-mx-id]');
|
||||
elementsWithMxId.forEach(el => el.removeAttribute('data-mx-id'));
|
||||
|
||||
const html = document.documentElement.outerHTML;
|
||||
const links = Array.from(document.querySelectorAll('a')).map(a => a.href);
|
||||
const allMetadata = getAllMeta();
|
||||
|
||||
return {
|
||||
title,
|
||||
description: getMeta('description'),
|
||||
text: bodyText,
|
||||
html: html,
|
||||
links: links,
|
||||
wordCount: bodyText.split(/\s+/).filter(w => w.length > 0).length,
|
||||
metadata: {
|
||||
...allMetadata,
|
||||
title,
|
||||
language: document.documentElement.lang || '',
|
||||
favicon: (document.querySelector('link[rel="icon"], link[rel="shortcut icon"]') as HTMLLinkElement)?.href || '',
|
||||
statusCode: 200
|
||||
}
|
||||
};
|
||||
});
|
||||
|
||||
crawlResults.push({
|
||||
metadata: {
|
||||
...pageData.metadata,
|
||||
url: url,
|
||||
sourceURL: url
|
||||
},
|
||||
html: pageData.html,
|
||||
text: pageData.text,
|
||||
links: pageData.links,
|
||||
wordCount: pageData.wordCount,
|
||||
scrapedAt: new Date().toISOString()
|
||||
});
|
||||
|
||||
this.log(`✓ Scraped ${url} (${pageData.wordCount} words)`, Level.LOG);
|
||||
|
||||
} catch (error) {
|
||||
this.log(`Failed to scrape ${url}: ${error.message}`, Level.WARN);
|
||||
crawlResults.push({
|
||||
url: url,
|
||||
error: error.message,
|
||||
scrapedAt: new Date().toISOString()
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
this.log(`Successfully scraped ${crawlResults.length} pages`, Level.LOG);
|
||||
|
||||
const actionType = "crawl";
|
||||
const actionName = "Crawl Results";
|
||||
|
||||
if (!this.serializableDataByType[actionType]) {
|
||||
this.serializableDataByType[actionType] = {};
|
||||
}
|
||||
if (!this.serializableDataByType[actionType][actionName]) {
|
||||
this.serializableDataByType[actionType][actionName] = [];
|
||||
}
|
||||
|
||||
this.serializableDataByType[actionType][actionName] = crawlResults;
|
||||
|
||||
await this.options.serializableCallback({
|
||||
scrapeList: this.serializableDataByType.scrapeList || {},
|
||||
scrapeSchema: this.serializableDataByType.scrapeSchema || {},
|
||||
crawl: this.serializableDataByType.crawl || {},
|
||||
search: this.serializableDataByType.search || {}
|
||||
});
|
||||
|
||||
} catch (error) {
|
||||
this.log(`Crawl action failed: ${error.message}`, Level.ERROR);
|
||||
throw new Error(`Crawl execution error: ${error.message}`);
|
||||
}
|
||||
},
|
||||
|
||||
search: async (searchConfig: {
|
||||
query: string;
|
||||
limit: number;
|
||||
provider?: 'duckduckgo';
|
||||
filters?: {
|
||||
timeRange?: 'day' | 'week' | 'month' | 'year';
|
||||
location?: string;
|
||||
lang?: string;
|
||||
};
|
||||
mode: 'discover' | 'scrape';
|
||||
}) => {
|
||||
if (this.isAborted) {
|
||||
this.log('Workflow aborted, stopping search', Level.WARN);
|
||||
return;
|
||||
}
|
||||
|
||||
if (this.options.debugChannel?.setActionType) {
|
||||
this.options.debugChannel.setActionType('search');
|
||||
}
|
||||
|
||||
searchConfig.provider = 'duckduckgo';
|
||||
|
||||
this.log(`Performing DuckDuckGo search for: ${searchConfig.query}`, Level.LOG);
|
||||
|
||||
try {
|
||||
let searchUrl = `https://duckduckgo.com/?q=${encodeURIComponent(searchConfig.query)}`;
|
||||
|
||||
if (searchConfig.filters?.timeRange) {
|
||||
const timeMap: Record<string, string> = {
|
||||
'day': 'd',
|
||||
'week': 'w',
|
||||
'month': 'm',
|
||||
'year': 'y'
|
||||
};
|
||||
searchUrl += `&df=${timeMap[searchConfig.filters.timeRange]}`;
|
||||
}
|
||||
|
||||
const initialDelay = 500 + Math.random() * 1000;
|
||||
await new Promise(resolve => setTimeout(resolve, initialDelay));
|
||||
|
||||
await page.goto(searchUrl, { waitUntil: 'domcontentloaded', timeout: 30000 });
|
||||
|
||||
await page.waitForLoadState('load', { timeout: 10000 }).catch(() => {
|
||||
this.log('Load state timeout, continuing anyway', Level.WARN);
|
||||
});
|
||||
|
||||
const pageLoadDelay = 2000 + Math.random() * 1500;
|
||||
await new Promise(resolve => setTimeout(resolve, pageLoadDelay));
|
||||
|
||||
let searchResults: any[] = [];
|
||||
let retryCount = 0;
|
||||
const maxRetries = 2;
|
||||
|
||||
while (searchResults.length === 0 && retryCount <= maxRetries) {
|
||||
if (retryCount > 0) {
|
||||
this.log(`Retry attempt ${retryCount}/${maxRetries} for DuckDuckGo search...`, Level.LOG);
|
||||
const retryDelay = 1000 * Math.pow(2, retryCount) + Math.random() * 1000;
|
||||
await new Promise(resolve => setTimeout(resolve, retryDelay));
|
||||
}
|
||||
|
||||
this.log('Attempting to extract DuckDuckGo search results...', Level.LOG);
|
||||
|
||||
await page.waitForSelector('[data-testid="result"], .result', { timeout: 5000 }).catch(() => {
|
||||
this.log('DuckDuckGo results not found on initial wait', Level.WARN);
|
||||
});
|
||||
|
||||
let currentResultCount = 0;
|
||||
const maxLoadAttempts = Math.ceil(searchConfig.limit / 10) * 2;
|
||||
let loadAttempts = 0;
|
||||
let noNewResultsCount = 0;
|
||||
|
||||
while (currentResultCount < searchConfig.limit && loadAttempts < maxLoadAttempts && noNewResultsCount < 3) {
|
||||
const previousCount = currentResultCount;
|
||||
|
||||
currentResultCount = await page.evaluate(() => {
|
||||
const selectors = [
|
||||
'[data-testid="result"]',
|
||||
'article[data-testid="result"]',
|
||||
'li[data-layout="organic"]',
|
||||
'.result',
|
||||
'article[data-testid]'
|
||||
];
|
||||
|
||||
for (const selector of selectors) {
|
||||
const elements = document.querySelectorAll(selector);
|
||||
if (elements.length > 0) {
|
||||
return elements.length;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
});
|
||||
|
||||
if (currentResultCount >= searchConfig.limit) {
|
||||
this.log(`Reached desired result count: ${currentResultCount}`, Level.LOG);
|
||||
break;
|
||||
}
|
||||
|
||||
if (currentResultCount === previousCount) {
|
||||
noNewResultsCount++;
|
||||
this.log(`No new results after load more (attempt ${noNewResultsCount}/3)`, Level.WARN);
|
||||
if (noNewResultsCount >= 3) break;
|
||||
} else {
|
||||
noNewResultsCount = 0;
|
||||
this.log(`Current results count: ${currentResultCount}/${searchConfig.limit}`, Level.LOG);
|
||||
}
|
||||
|
||||
await page.evaluate(() => {
|
||||
window.scrollTo(0, document.body.scrollHeight);
|
||||
});
|
||||
|
||||
await new Promise(resolve => setTimeout(resolve, 800));
|
||||
|
||||
const loadMoreClicked = await page.evaluate(() => {
|
||||
const selectors = [
|
||||
'#more-results',
|
||||
'button:has-text("More results")',
|
||||
'button:has-text("more results")',
|
||||
'button[id*="more"]',
|
||||
'button:has-text("Load more")'
|
||||
];
|
||||
|
||||
for (const selector of selectors) {
|
||||
try {
|
||||
const button = document.querySelector(selector) as HTMLButtonElement;
|
||||
if (button && button.offsetParent !== null) {
|
||||
button.click();
|
||||
console.log(`Clicked load more button with selector: ${selector}`);
|
||||
return true;
|
||||
}
|
||||
} catch (e) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
});
|
||||
|
||||
if (loadMoreClicked) {
|
||||
this.log('Clicked "More results" button', Level.LOG);
|
||||
await new Promise(resolve => setTimeout(resolve, 1500 + Math.random() * 1000));
|
||||
} else {
|
||||
this.log('No "More results" button found, results may be limited', Level.WARN);
|
||||
break;
|
||||
}
|
||||
|
||||
loadAttempts++;
|
||||
}
|
||||
|
||||
this.log(`Finished pagination. Total results available: ${currentResultCount}`, Level.LOG);
|
||||
|
||||
searchResults = await page.evaluate((limit: number) => {
|
||||
const results: any[] = [];
|
||||
|
||||
const cleanDescription = (text: string): string => {
|
||||
if (!text) return '';
|
||||
let cleaned = text.replace(/^\d+\s+(second|minute|hour|day|week|month|year)s?\s+ago\s*/i, '');
|
||||
cleaned = cleaned.replace(/^[A-Z][a-z]{2}\s+\d{1,2},?\s+\d{4}\s*[—\-]\s*/i, '');
|
||||
cleaned = cleaned.replace(/^\d{4}-\d{2}-\d{2}\s*[—\-]\s*/i, '');
|
||||
cleaned = cleaned.trim().replace(/\s+/g, ' ');
|
||||
return cleaned;
|
||||
};
|
||||
|
||||
const selectors = [
|
||||
'[data-testid="result"]',
|
||||
'article[data-testid="result"]',
|
||||
'li[data-layout="organic"]',
|
||||
'.result',
|
||||
'article[data-testid]'
|
||||
];
|
||||
let allElements: Element[] = [];
|
||||
|
||||
for (const selector of selectors) {
|
||||
const elements = Array.from(document.querySelectorAll(selector));
|
||||
if (elements.length > 0) {
|
||||
console.log(`Found ${elements.length} DDG elements with: ${selector}`);
|
||||
allElements = elements;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
for (let i = 0; i < Math.min(allElements.length, limit); i++) {
|
||||
const element = allElements[i];
|
||||
|
||||
const titleEl = element.querySelector('h2, [data-testid="result-title-a"], h3, [data-testid="result-title"]');
|
||||
|
||||
let linkEl = titleEl?.querySelector('a[href]') as HTMLAnchorElement;
|
||||
if (!linkEl) {
|
||||
linkEl = element.querySelector('a[href]') as HTMLAnchorElement;
|
||||
}
|
||||
|
||||
if (!linkEl || !linkEl.href) continue;
|
||||
|
||||
let actualUrl = linkEl.href;
|
||||
|
||||
if (actualUrl.includes('uddg=')) {
|
||||
try {
|
||||
const urlParams = new URLSearchParams(actualUrl.split('?')[1]);
|
||||
const uddgUrl = urlParams.get('uddg');
|
||||
if (uddgUrl) {
|
||||
actualUrl = decodeURIComponent(uddgUrl);
|
||||
}
|
||||
} catch (e) {
|
||||
console.log('Failed to parse uddg parameter:', e);
|
||||
}
|
||||
}
|
||||
|
||||
if (actualUrl.includes('duckduckgo.com')) {
|
||||
console.log(`Skipping DDG internal URL: ${actualUrl}`);
|
||||
continue;
|
||||
}
|
||||
|
||||
const descEl = element.querySelector('[data-result="snippet"], .result__snippet, [data-testid="result-snippet"]');
|
||||
|
||||
if (titleEl && titleEl.textContent && actualUrl) {
|
||||
const rawDescription = (descEl?.textContent || '').trim();
|
||||
const cleanedDescription = cleanDescription(rawDescription);
|
||||
|
||||
results.push({
|
||||
url: actualUrl,
|
||||
title: titleEl.textContent.trim(),
|
||||
description: cleanedDescription,
|
||||
position: results.length + 1
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
console.log(`Extracted ${results.length} DuckDuckGo search results`);
|
||||
return results;
|
||||
}, searchConfig.limit);
|
||||
|
||||
if (searchResults.length === 0) {
|
||||
this.log(`No DuckDuckGo results found (attempt ${retryCount + 1}/${maxRetries + 1})`, Level.WARN);
|
||||
retryCount++;
|
||||
} else {
|
||||
this.log(`Successfully extracted ${searchResults.length} results`, Level.LOG);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
this.log(`Search found ${searchResults.length} results`, Level.LOG);
|
||||
|
||||
if (searchConfig.mode === 'discover') {
|
||||
const actionType = "search";
|
||||
const actionName = "Search Results";
|
||||
|
||||
if (!this.serializableDataByType[actionType]) {
|
||||
this.serializableDataByType[actionType] = {};
|
||||
}
|
||||
if (!this.serializableDataByType[actionType][actionName]) {
|
||||
this.serializableDataByType[actionType][actionName] = {};
|
||||
}
|
||||
|
||||
const searchData = {
|
||||
query: searchConfig.query,
|
||||
provider: searchConfig.provider,
|
||||
filters: searchConfig.filters || {},
|
||||
resultsCount: searchResults.length,
|
||||
results: searchResults,
|
||||
searchedAt: new Date().toISOString()
|
||||
};
|
||||
|
||||
this.serializableDataByType[actionType][actionName] = searchData;
|
||||
|
||||
await this.options.serializableCallback({
|
||||
scrapeList: this.serializableDataByType.scrapeList || {},
|
||||
scrapeSchema: this.serializableDataByType.scrapeSchema || {},
|
||||
crawl: this.serializableDataByType.crawl || {},
|
||||
search: this.serializableDataByType.search || {}
|
||||
});
|
||||
|
||||
this.log(`Search completed in discover mode with ${searchResults.length} results`, Level.LOG);
|
||||
return;
|
||||
}
|
||||
|
||||
this.log(`Starting to scrape content from ${searchResults.length} search results...`, Level.LOG);
|
||||
const scrapedResults = [];
|
||||
|
||||
for (let i = 0; i < searchResults.length; i++) {
|
||||
const result = searchResults[i];
|
||||
try {
|
||||
this.log(`[${i + 1}/${searchResults.length}] Scraping: ${result.url}`, Level.LOG);
|
||||
|
||||
await page.goto(result.url, {
|
||||
waitUntil: 'domcontentloaded',
|
||||
timeout: 30000
|
||||
}).catch(() => {
|
||||
this.log(`Failed to navigate to ${result.url}, skipping...`, Level.WARN);
|
||||
});
|
||||
|
||||
await page.waitForLoadState('load', { timeout: 10000 }).catch(() => {});
|
||||
|
||||
const pageData = await page.evaluate(() => {
|
||||
const getMeta = (name: string) => {
|
||||
const meta = document.querySelector(`meta[name="${name}"], meta[property="${name}"]`);
|
||||
return meta?.getAttribute('content') || '';
|
||||
};
|
||||
|
||||
const getAllMeta = () => {
|
||||
const metadata: Record<string, string> = {};
|
||||
const metaTags = document.querySelectorAll('meta');
|
||||
metaTags.forEach(tag => {
|
||||
const name = tag.getAttribute('name') || tag.getAttribute('property');
|
||||
const content = tag.getAttribute('content');
|
||||
if (name && content) {
|
||||
metadata[name] = content;
|
||||
}
|
||||
});
|
||||
return metadata;
|
||||
};
|
||||
|
||||
const title = document.title || '';
|
||||
const bodyText = document.body?.innerText || '';
|
||||
|
||||
const elementsWithMxId = document.querySelectorAll('[data-mx-id]');
|
||||
elementsWithMxId.forEach(el => el.removeAttribute('data-mx-id'));
|
||||
|
||||
const html = document.documentElement.outerHTML;
|
||||
const links = Array.from(document.querySelectorAll('a')).map(a => a.href);
|
||||
const allMetadata = getAllMeta();
|
||||
|
||||
return {
|
||||
title,
|
||||
description: getMeta('description'),
|
||||
text: bodyText,
|
||||
html: html,
|
||||
links: links,
|
||||
wordCount: bodyText.split(/\s+/).filter(w => w.length > 0).length,
|
||||
metadata: {
|
||||
...allMetadata,
|
||||
title,
|
||||
language: document.documentElement.lang || '',
|
||||
favicon: (document.querySelector('link[rel="icon"], link[rel="shortcut icon"]') as HTMLLinkElement)?.href || '',
|
||||
statusCode: 200
|
||||
}
|
||||
};
|
||||
});
|
||||
|
||||
scrapedResults.push({
|
||||
searchResult: {
|
||||
query: searchConfig.query,
|
||||
position: result.position,
|
||||
searchTitle: result.title,
|
||||
searchDescription: result.description,
|
||||
},
|
||||
metadata: {
|
||||
...pageData.metadata,
|
||||
url: result.url,
|
||||
sourceURL: result.url
|
||||
},
|
||||
html: pageData.html,
|
||||
text: pageData.text,
|
||||
links: pageData.links,
|
||||
wordCount: pageData.wordCount,
|
||||
scrapedAt: new Date().toISOString()
|
||||
});
|
||||
|
||||
this.log(`✓ Scraped ${result.url} (${pageData.wordCount} words)`, Level.LOG);
|
||||
|
||||
} catch (error) {
|
||||
this.log(`Failed to scrape ${result.url}: ${error.message}`, Level.WARN);
|
||||
scrapedResults.push({
|
||||
searchResult: {
|
||||
query: searchConfig.query,
|
||||
position: result.position,
|
||||
searchTitle: result.title,
|
||||
searchDescription: result.description,
|
||||
},
|
||||
url: result.url,
|
||||
error: error.message,
|
||||
scrapedAt: new Date().toISOString()
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
this.log(`Successfully scraped ${scrapedResults.length} search results`, Level.LOG);
|
||||
|
||||
const actionType = "search";
|
||||
const actionName = "Search Results";
|
||||
|
||||
if (!this.serializableDataByType[actionType]) {
|
||||
this.serializableDataByType[actionType] = {};
|
||||
}
|
||||
if (!this.serializableDataByType[actionType][actionName]) {
|
||||
this.serializableDataByType[actionType][actionName] = {};
|
||||
}
|
||||
|
||||
const searchData = {
|
||||
query: searchConfig.query,
|
||||
provider: searchConfig.provider,
|
||||
filters: searchConfig.filters || {},
|
||||
mode: searchConfig.mode,
|
||||
resultsCount: scrapedResults.length,
|
||||
results: scrapedResults,
|
||||
searchedAt: new Date().toISOString()
|
||||
};
|
||||
|
||||
this.serializableDataByType[actionType][actionName] = searchData;
|
||||
|
||||
await this.options.serializableCallback({
|
||||
scrapeList: this.serializableDataByType.scrapeList || {},
|
||||
scrapeSchema: this.serializableDataByType.scrapeSchema || {},
|
||||
crawl: this.serializableDataByType.crawl || {},
|
||||
search: this.serializableDataByType.search || {}
|
||||
});
|
||||
|
||||
} catch (error) {
|
||||
this.log(`Search action failed: ${error.message}`, Level.ERROR);
|
||||
throw new Error(`Search execution error: ${error.message}`);
|
||||
}
|
||||
},
|
||||
|
||||
flag: async () => new Promise((res) => {
|
||||
if (this.options.debugChannel?.setActionType) {
|
||||
this.options.debugChannel.setActionType('flag');
|
||||
@@ -890,7 +1638,9 @@ export default class Interpreter extends EventEmitter {
|
||||
this.serializableDataByType[actionType][actionName] = [...allResults];
|
||||
await this.options.serializableCallback({
|
||||
scrapeList: this.serializableDataByType.scrapeList,
|
||||
scrapeSchema: this.serializableDataByType.scrapeSchema
|
||||
scrapeSchema: this.serializableDataByType.scrapeSchema,
|
||||
crawl: this.serializableDataByType.crawl || {},
|
||||
search: this.serializableDataByType.search || {}
|
||||
});
|
||||
};
|
||||
|
||||
@@ -1758,7 +2508,7 @@ export default class Interpreter extends EventEmitter {
|
||||
// Clear accumulated data to free memory
|
||||
this.cumulativeResults = [];
|
||||
this.namedResults = {};
|
||||
this.serializableDataByType = { scrapeList: {}, scrapeSchema: {} };
|
||||
this.serializableDataByType = { scrapeList: {}, scrapeSchema: {}, crawl: {}, search: {} };
|
||||
|
||||
// Reset state
|
||||
this.isAborted = false;
|
||||
|
||||
@@ -28,7 +28,7 @@ type MethodNames<T> = {
|
||||
[K in keyof T]: T[K] extends Function ? K : never;
|
||||
}[keyof T];
|
||||
|
||||
export type CustomFunctions = 'scrape' | 'scrapeSchema' | 'scroll' | 'screenshot' | 'script' | 'enqueueLinks' | 'flag' | 'scrapeList' | 'scrapeListAuto';
|
||||
export type CustomFunctions = 'scrape' | 'scrapeSchema' | 'scroll' | 'screenshot' | 'script' | 'enqueueLinks' | 'flag' | 'scrapeList' | 'scrapeListAuto' | 'crawl' | 'search';
|
||||
|
||||
export type What = {
|
||||
action: MethodNames<Page> | CustomFunctions,
|
||||
|
||||
@@ -13,8 +13,8 @@ import { AuthenticatedRequest } from "../routes/record"
|
||||
import {capture} from "../utils/analytics";
|
||||
import { Page } from "playwright-core";
|
||||
import { WorkflowFile } from "maxun-core";
|
||||
import { addGoogleSheetUpdateTask, googleSheetUpdateTasks, processGoogleSheetUpdates } from "../workflow-management/integrations/gsheet";
|
||||
import { addAirtableUpdateTask, airtableUpdateTasks, processAirtableUpdates } from "../workflow-management/integrations/airtable";
|
||||
import { addGoogleSheetUpdateTask, processGoogleSheetUpdates } from "../workflow-management/integrations/gsheet";
|
||||
import { addAirtableUpdateTask, processAirtableUpdates } from "../workflow-management/integrations/airtable";
|
||||
import { sendWebhook } from "../routes/webhook";
|
||||
import { convertPageToHTML, convertPageToMarkdown, convertPageToScreenshot } from '../markdownify/scrape';
|
||||
|
||||
@@ -309,8 +309,8 @@ router.get("/robots/:id/runs",requireAPIKey, async (req: Request, res: Response)
|
||||
statusCode: 200,
|
||||
messageCode: "success",
|
||||
runs: {
|
||||
totalCount: formattedRuns.length,
|
||||
items: formattedRuns,
|
||||
totalCount: formattedRuns.length,
|
||||
items: formattedRuns,
|
||||
},
|
||||
};
|
||||
|
||||
@@ -342,6 +342,8 @@ function formatRunResponse(run: any) {
|
||||
data: {
|
||||
textData: {},
|
||||
listData: {},
|
||||
crawlData: {},
|
||||
searchData: {},
|
||||
markdown: '',
|
||||
html: ''
|
||||
},
|
||||
@@ -358,6 +360,14 @@ function formatRunResponse(run: any) {
|
||||
formattedRun.data.listData = output.scrapeList;
|
||||
}
|
||||
|
||||
if (output.crawl && typeof output.crawl === 'object') {
|
||||
formattedRun.data.crawlData = output.crawl;
|
||||
}
|
||||
|
||||
if (output.search && typeof output.search === 'object') {
|
||||
formattedRun.data.searchData = output.search;
|
||||
}
|
||||
|
||||
if (output.markdown && Array.isArray(output.markdown)) {
|
||||
formattedRun.data.markdown = output.markdown[0]?.content || '';
|
||||
}
|
||||
@@ -466,7 +476,7 @@ router.get("/robots/:id/runs/:runId", requireAPIKey, async (req: Request, res: R
|
||||
}
|
||||
});
|
||||
|
||||
async function createWorkflowAndStoreMetadata(id: string, userId: string) {
|
||||
async function createWorkflowAndStoreMetadata(id: string, userId: string, isSDK: boolean) {
|
||||
try {
|
||||
const recording = await Robot.findOne({
|
||||
where: {
|
||||
@@ -510,7 +520,9 @@ async function createWorkflowAndStoreMetadata(id: string, userId: string) {
|
||||
interpreterSettings: { maxConcurrency: 1, maxRepeats: 1, debug: true },
|
||||
log: '',
|
||||
runId,
|
||||
runByAPI: true,
|
||||
runByUserId: userId,
|
||||
runByAPI: !isSDK,
|
||||
runBySDK: isSDK,
|
||||
serializableOutput: {},
|
||||
binaryOutput: {},
|
||||
retryCount: 0
|
||||
@@ -687,7 +699,6 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
|
||||
|
||||
let formats = recording.recording_meta.formats || ['markdown'];
|
||||
|
||||
// Override if API request defines formats
|
||||
if (requestedFormats && Array.isArray(requestedFormats) && requestedFormats.length > 0) {
|
||||
formats = requestedFormats.filter((f): f is 'markdown' | 'html' | 'screenshot-visible' | 'screenshot-fullpage' =>
|
||||
['markdown', 'html', 'screenshot-visible', 'screenshot-fullpage'].includes(f)
|
||||
@@ -714,50 +725,70 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
|
||||
const SCRAPE_TIMEOUT = 120000;
|
||||
|
||||
if (formats.includes('markdown')) {
|
||||
const markdownPromise = convertPageToMarkdown(url, currentPage);
|
||||
const timeoutPromise = new Promise<never>((_, reject) => {
|
||||
setTimeout(() => reject(new Error(`Markdown conversion timed out after ${SCRAPE_TIMEOUT/1000}s`)), SCRAPE_TIMEOUT);
|
||||
});
|
||||
markdown = await Promise.race([markdownPromise, timeoutPromise]);
|
||||
serializableOutput.markdown = [{ content: markdown }];
|
||||
try {
|
||||
const markdownPromise = convertPageToMarkdown(url, currentPage);
|
||||
const timeoutPromise = new Promise<never>((_, reject) => {
|
||||
setTimeout(() => reject(new Error(`Markdown conversion timed out after ${SCRAPE_TIMEOUT / 1000}s`)), SCRAPE_TIMEOUT);
|
||||
});
|
||||
markdown = await Promise.race([markdownPromise, timeoutPromise]);
|
||||
if (markdown && markdown.trim().length > 0) {
|
||||
serializableOutput.markdown = [{ content: markdown }];
|
||||
}
|
||||
} catch (error: any) {
|
||||
logger.log('warn', `Markdown conversion failed for API run ${plainRun.runId}: ${error.message}`);
|
||||
}
|
||||
}
|
||||
|
||||
if (formats.includes('html')) {
|
||||
const htmlPromise = convertPageToHTML(url, currentPage);
|
||||
const timeoutPromise = new Promise<never>((_, reject) => {
|
||||
setTimeout(() => reject(new Error(`HTML conversion timed out after ${SCRAPE_TIMEOUT/1000}s`)), SCRAPE_TIMEOUT);
|
||||
});
|
||||
html = await Promise.race([htmlPromise, timeoutPromise]);
|
||||
serializableOutput.html = [{ content: html }];
|
||||
try {
|
||||
const htmlPromise = convertPageToHTML(url, currentPage);
|
||||
const timeoutPromise = new Promise<never>((_, reject) => {
|
||||
setTimeout(() => reject(new Error(`HTML conversion timed out after ${SCRAPE_TIMEOUT / 1000}s`)), SCRAPE_TIMEOUT);
|
||||
});
|
||||
html = await Promise.race([htmlPromise, timeoutPromise]);
|
||||
if (html && html.trim().length > 0) {
|
||||
serializableOutput.html = [{ content: html }];
|
||||
}
|
||||
} catch (error: any) {
|
||||
logger.log('warn', `HTML conversion failed for API run ${plainRun.runId}: ${error.message}`);
|
||||
}
|
||||
}
|
||||
|
||||
if (formats.includes("screenshot-visible")) {
|
||||
const screenshotPromise = convertPageToScreenshot(url, currentPage, false);
|
||||
const timeoutPromise = new Promise<never>((_, reject) => {
|
||||
setTimeout(() => reject(new Error(`Screenshot conversion timed out after ${SCRAPE_TIMEOUT/1000}s`)), SCRAPE_TIMEOUT);
|
||||
});
|
||||
const screenshotBuffer = await Promise.race([screenshotPromise, timeoutPromise]);
|
||||
try {
|
||||
const screenshotPromise = convertPageToScreenshot(url, currentPage, false);
|
||||
const timeoutPromise = new Promise<never>((_, reject) => {
|
||||
setTimeout(() => reject(new Error(`Screenshot conversion timed out after ${SCRAPE_TIMEOUT / 1000}s`)), SCRAPE_TIMEOUT);
|
||||
});
|
||||
const screenshotBuffer = await Promise.race([screenshotPromise, timeoutPromise]);
|
||||
|
||||
if (!binaryOutput['screenshot-visible']) {
|
||||
binaryOutput['screenshot-visible'] = {
|
||||
data: screenshotBuffer.toString('base64'),
|
||||
mimeType: 'image/png'
|
||||
};
|
||||
if (screenshotBuffer && screenshotBuffer.length > 0) {
|
||||
binaryOutput['screenshot-visible'] = {
|
||||
data: screenshotBuffer.toString('base64'),
|
||||
mimeType: 'image/png'
|
||||
};
|
||||
}
|
||||
} catch (error: any) {
|
||||
logger.log('warn', `Screenshot-visible conversion failed for API run ${plainRun.runId}: ${error.message}`);
|
||||
}
|
||||
}
|
||||
|
||||
if (formats.includes("screenshot-fullpage")) {
|
||||
const screenshotPromise = convertPageToScreenshot(url, currentPage, true);
|
||||
const timeoutPromise = new Promise<never>((_, reject) => {
|
||||
setTimeout(() => reject(new Error(`Screenshot conversion timed out after ${SCRAPE_TIMEOUT/1000}s`)), SCRAPE_TIMEOUT);
|
||||
});
|
||||
const screenshotBuffer = await Promise.race([screenshotPromise, timeoutPromise]);
|
||||
try {
|
||||
const screenshotPromise = convertPageToScreenshot(url, currentPage, true);
|
||||
const timeoutPromise = new Promise<never>((_, reject) => {
|
||||
setTimeout(() => reject(new Error(`Screenshot conversion timed out after ${SCRAPE_TIMEOUT / 1000}s`)), SCRAPE_TIMEOUT);
|
||||
});
|
||||
const screenshotBuffer = await Promise.race([screenshotPromise, timeoutPromise]);
|
||||
|
||||
if (!binaryOutput['screenshot-fullpage']) {
|
||||
binaryOutput['screenshot-fullpage'] = {
|
||||
data: screenshotBuffer.toString('base64'),
|
||||
mimeType: 'image/png'
|
||||
};
|
||||
if (screenshotBuffer && screenshotBuffer.length > 0) {
|
||||
binaryOutput['screenshot-fullpage'] = {
|
||||
data: screenshotBuffer.toString('base64'),
|
||||
mimeType: 'image/png'
|
||||
};
|
||||
}
|
||||
} catch (error: any) {
|
||||
logger.log('warn', `Screenshot-fullpage conversion failed for API run ${plainRun.runId}: ${error.message}`);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -769,7 +800,6 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
|
||||
binaryOutput,
|
||||
});
|
||||
|
||||
// Upload binary output (screenshots) to MinIO if present
|
||||
let uploadedBinaryOutput: Record<string, string> = {};
|
||||
if (Object.keys(binaryOutput).length > 0) {
|
||||
const binaryOutputService = new BinaryOutputService('maxun-run-screenshots');
|
||||
@@ -779,7 +809,6 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
|
||||
|
||||
logger.log('info', `Markdown robot execution completed for API run ${id}`);
|
||||
|
||||
// Push success socket event
|
||||
try {
|
||||
const completionData = {
|
||||
runId: plainRun.runId,
|
||||
@@ -800,7 +829,6 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
|
||||
);
|
||||
}
|
||||
|
||||
// Build webhook payload
|
||||
const webhookPayload: any = {
|
||||
robot_id: plainRun.robotMetaId,
|
||||
run_id: plainRun.runId,
|
||||
@@ -814,8 +842,8 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
|
||||
},
|
||||
};
|
||||
|
||||
if (formats.includes('markdown')) webhookPayload.markdown = markdown;
|
||||
if (formats.includes('html')) webhookPayload.html = html;
|
||||
if (serializableOutput.markdown) webhookPayload.markdown = markdown;
|
||||
if (serializableOutput.html) webhookPayload.html = html;
|
||||
if (uploadedBinaryOutput['screenshot-visible']) webhookPayload.screenshot_visible = uploadedBinaryOutput['screenshot-visible'];
|
||||
if (uploadedBinaryOutput['screenshot-fullpage']) webhookPayload.screenshot_fullpage = uploadedBinaryOutput['screenshot-fullpage'];
|
||||
|
||||
@@ -834,9 +862,12 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
|
||||
|
||||
capture("maxun-oss-run-created-api", {
|
||||
runId: plainRun.runId,
|
||||
user_id: userId,
|
||||
userId: userId,
|
||||
robotId: recording.recording_meta.id,
|
||||
robotType: "scrape",
|
||||
source: "api",
|
||||
status: "success",
|
||||
robot_type: "scrape",
|
||||
createdAt: new Date().toISOString(),
|
||||
formats
|
||||
});
|
||||
|
||||
@@ -858,14 +889,14 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
|
||||
log: `${formats.join(', ')} conversion failed: ${error.message}`,
|
||||
});
|
||||
|
||||
// Send failure socket event
|
||||
try {
|
||||
const failureData = {
|
||||
runId: plainRun.runId,
|
||||
robotMetaId: plainRun.robotMetaId,
|
||||
robotName: recording.recording_meta.name,
|
||||
status: 'failed',
|
||||
finishedAt: new Date().toLocaleString()
|
||||
finishedAt: new Date().toLocaleString(),
|
||||
error: error.message
|
||||
};
|
||||
|
||||
serverIo
|
||||
@@ -895,11 +926,14 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
|
||||
logger.log('warn', `Failed to send webhook for failed API scrape run ${plainRun.runId}: ${webhookError.message}`);
|
||||
}
|
||||
|
||||
capture("maxun-oss-run-created-api", {
|
||||
capture("maxun-oss-run-created", {
|
||||
runId: plainRun.runId,
|
||||
user_id: userId,
|
||||
userId: userId,
|
||||
robotId: recording.recording_meta.id,
|
||||
robotType: "scrape",
|
||||
source: "api",
|
||||
status: "failed",
|
||||
robot_type: "scrape",
|
||||
createdAt: new Date().toISOString(),
|
||||
formats
|
||||
});
|
||||
|
||||
@@ -993,15 +1027,18 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
|
||||
|
||||
const totalRowsExtracted = totalSchemaItemsExtracted + totalListItemsExtracted;
|
||||
|
||||
capture('maxun-oss-run-created-api',{
|
||||
capture('maxun-oss-run-created',{
|
||||
runId: id,
|
||||
created_at: new Date().toISOString(),
|
||||
userId: userId,
|
||||
robotId: recording.recording_meta.id,
|
||||
robotType: recording.recording_meta.type || 'extract',
|
||||
source: 'api',
|
||||
createdAt: new Date().toISOString(),
|
||||
status: 'success',
|
||||
totalRowsExtracted,
|
||||
schemaItemsExtracted: totalSchemaItemsExtracted,
|
||||
listItemsExtracted: totalListItemsExtracted,
|
||||
totalSchemaItemsExtracted,
|
||||
totalListItemsExtracted,
|
||||
extractedScreenshotsCount,
|
||||
is_llm: (recording.recording_meta as any).isLLM,
|
||||
totalRowsExtracted
|
||||
}
|
||||
)
|
||||
|
||||
@@ -1019,6 +1056,16 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
|
||||
typeof parsedOutput.scrapeSchema === "string"
|
||||
? JSON.parse(parsedOutput.scrapeSchema)
|
||||
: parsedOutput.scrapeSchema || {};
|
||||
|
||||
const parsedCrawl =
|
||||
typeof parsedOutput.crawl === "string"
|
||||
? JSON.parse(parsedOutput.crawl)
|
||||
: parsedOutput.crawl || {};
|
||||
|
||||
const parsedSearch =
|
||||
typeof parsedOutput.search === "string"
|
||||
? JSON.parse(parsedOutput.search)
|
||||
: parsedOutput.search || {};
|
||||
|
||||
const webhookPayload = {
|
||||
robot_id: plainRun.robotMetaId,
|
||||
@@ -1030,6 +1077,8 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
|
||||
extracted_data: {
|
||||
captured_texts: parsedSchema || {},
|
||||
captured_lists: parsedList || {},
|
||||
crawl_data: parsedCrawl || {},
|
||||
search_data: parsedSearch || {},
|
||||
captured_texts_count: totalSchemaItemsExtracted,
|
||||
captured_lists_count: totalListItemsExtracted,
|
||||
screenshots_count: extractedScreenshotsCount
|
||||
@@ -1097,7 +1146,6 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
|
||||
|
||||
const recording = await Robot.findOne({ where: { 'recording_meta.id': run.robotMetaId }, raw: true });
|
||||
|
||||
// Trigger webhooks for run failure
|
||||
const failedWebhookPayload = {
|
||||
robot_id: run.robotMetaId,
|
||||
run_id: run.runId,
|
||||
@@ -1123,10 +1171,14 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
|
||||
logger.log('error', `Failed to send failure webhooks for run ${run.runId}: ${webhookError.message}`);
|
||||
}
|
||||
capture(
|
||||
'maxun-oss-run-created-api',
|
||||
'maxun-oss-run-created',
|
||||
{
|
||||
runId: id,
|
||||
created_at: new Date().toISOString(),
|
||||
userId: userId,
|
||||
robotId: recording?.recording_meta?.id || run.robotMetaId,
|
||||
robotType: recording?.recording_meta?.type || 'extract',
|
||||
source: 'api',
|
||||
createdAt: new Date().toISOString(),
|
||||
status: 'failed',
|
||||
is_llm: (recording?.recording_meta as any)?.isLLM,
|
||||
}
|
||||
@@ -1139,11 +1191,11 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
|
||||
}
|
||||
}
|
||||
|
||||
export async function handleRunRecording(id: string, userId: string, requestedFormats?: string[]) {
|
||||
export async function handleRunRecording(id: string, userId: string, isSDK: boolean = false) {
|
||||
let socket: Socket | null = null;
|
||||
|
||||
try {
|
||||
const result = await createWorkflowAndStoreMetadata(id, userId);
|
||||
const result = await createWorkflowAndStoreMetadata(id, userId, isSDK);
|
||||
const { browserId, runId: newRunId } = result;
|
||||
|
||||
if (!browserId || !newRunId || !userId) {
|
||||
@@ -1167,6 +1219,10 @@ export async function handleRunRecording(id: string, userId: string, requestedFo
|
||||
cleanupSocketConnection(socket!, browserId, newRunId);
|
||||
});
|
||||
|
||||
socket.on('error', (error: Error) => {
|
||||
logger.error(`Socket error for API run ${newRunId}: ${error.message}`);
|
||||
});
|
||||
|
||||
socket.on('disconnect', () => {
|
||||
cleanupSocketConnection(socket!, browserId, newRunId);
|
||||
});
|
||||
@@ -1318,9 +1374,7 @@ router.post("/robots/:id/runs", requireAPIKey, async (req: AuthenticatedRequest,
|
||||
return res.status(401).json({ ok: false, error: 'Unauthorized' });
|
||||
}
|
||||
|
||||
const requestedFormats = req.body.formats;
|
||||
|
||||
const runId = await handleRunRecording(req.params.id, req.user.id, requestedFormats);
|
||||
const runId = await handleRunRecording(req.params.id, req.user.id);
|
||||
|
||||
if (!runId) {
|
||||
throw new Error('Run ID is undefined');
|
||||
|
||||
@@ -455,13 +455,35 @@ router.post("/sdk/robots/:id/execute", requireAPIKey, async (req: AuthenticatedR
|
||||
}
|
||||
}
|
||||
|
||||
let crawlData: any[] = [];
|
||||
if (run.serializableOutput?.crawl) {
|
||||
const crawl: any = run.serializableOutput.crawl;
|
||||
|
||||
if (Array.isArray(crawl)) {
|
||||
crawlData = crawl;
|
||||
}
|
||||
else if (typeof crawl === 'object') {
|
||||
const crawlValues = Object.values(crawl);
|
||||
if (crawlValues.length > 0 && Array.isArray(crawlValues[0])) {
|
||||
crawlData = crawlValues[0] as any[];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let searchData: any = {};
|
||||
if (run.serializableOutput?.search) {
|
||||
searchData = run.serializableOutput.search;
|
||||
}
|
||||
|
||||
return res.status(200).json({
|
||||
data: {
|
||||
runId: run.runId,
|
||||
status: run.status,
|
||||
data: {
|
||||
textData: run.serializableOutput?.scrapeSchema || {},
|
||||
listData: listData
|
||||
listData: listData,
|
||||
crawlData: crawlData,
|
||||
searchData: searchData
|
||||
},
|
||||
screenshots: Object.values(run.binaryOutput || {})
|
||||
}
|
||||
@@ -645,6 +667,202 @@ router.post("/sdk/robots/:id/runs/:runId/abort", requireAPIKey, async (req: Auth
|
||||
}
|
||||
});
|
||||
|
||||
/**
|
||||
* Create a crawl robot programmatically
|
||||
* POST /api/sdk/crawl
|
||||
*/
|
||||
router.post("/sdk/crawl", requireAPIKey, async (req: AuthenticatedRequest, res: Response) => {
|
||||
try {
|
||||
const user = req.user;
|
||||
const { url, name, crawlConfig } = req.body;
|
||||
|
||||
if (!url || !crawlConfig) {
|
||||
return res.status(400).json({
|
||||
error: "URL and crawl configuration are required"
|
||||
});
|
||||
}
|
||||
|
||||
try {
|
||||
new URL(url);
|
||||
} catch (err) {
|
||||
return res.status(400).json({
|
||||
error: "Invalid URL format"
|
||||
});
|
||||
}
|
||||
|
||||
if (typeof crawlConfig !== 'object') {
|
||||
return res.status(400).json({
|
||||
error: "crawlConfig must be an object"
|
||||
});
|
||||
}
|
||||
|
||||
const robotName = name || `Crawl Robot - ${new URL(url).hostname}`;
|
||||
const robotId = uuid();
|
||||
const metaId = uuid();
|
||||
|
||||
const robot = await Robot.create({
|
||||
id: robotId,
|
||||
userId: user.id,
|
||||
recording_meta: {
|
||||
name: robotName,
|
||||
id: metaId,
|
||||
createdAt: new Date().toISOString(),
|
||||
updatedAt: new Date().toISOString(),
|
||||
pairs: 1,
|
||||
params: [],
|
||||
type: 'crawl',
|
||||
url: url,
|
||||
},
|
||||
recording: {
|
||||
workflow: [
|
||||
{
|
||||
where: { url },
|
||||
what: [
|
||||
{ action: 'flag', args: ['generated'] },
|
||||
{
|
||||
action: 'crawl',
|
||||
args: [crawlConfig],
|
||||
name: 'Crawl'
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
where: { url: 'about:blank' },
|
||||
what: [
|
||||
{
|
||||
action: 'goto',
|
||||
args: [url]
|
||||
},
|
||||
{
|
||||
action: 'waitForLoadState',
|
||||
args: ['networkidle']
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
});
|
||||
|
||||
logger.info(`[SDK] Crawl robot created: ${metaId} (db: ${robotId}) by user ${user.id}`);
|
||||
|
||||
capture("maxun-oss-robot-created", {
|
||||
userId: user.id.toString(),
|
||||
robotId: metaId,
|
||||
robotName: robotName,
|
||||
url: url,
|
||||
robotType: 'crawl',
|
||||
crawlConfig: crawlConfig,
|
||||
source: 'sdk'
|
||||
});
|
||||
|
||||
return res.status(201).json({
|
||||
data: robot,
|
||||
message: "Crawl robot created successfully"
|
||||
});
|
||||
|
||||
} catch (error: any) {
|
||||
logger.error("[SDK] Error creating crawl robot:", error);
|
||||
return res.status(500).json({
|
||||
error: "Failed to create crawl robot",
|
||||
message: error.message
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
/**
|
||||
* Create a search robot programmatically
|
||||
* POST /api/sdk/search
|
||||
*/
|
||||
router.post("/sdk/search", requireAPIKey, async (req: AuthenticatedRequest, res: Response) => {
|
||||
try {
|
||||
const user = req.user;
|
||||
const { name, searchConfig } = req.body;
|
||||
|
||||
if (!searchConfig) {
|
||||
return res.status(400).json({
|
||||
error: "Search configuration is required"
|
||||
});
|
||||
}
|
||||
|
||||
if (!searchConfig.query) {
|
||||
return res.status(400).json({
|
||||
error: "searchConfig must include a query"
|
||||
});
|
||||
}
|
||||
|
||||
if (typeof searchConfig !== 'object') {
|
||||
return res.status(400).json({
|
||||
error: "searchConfig must be an object"
|
||||
});
|
||||
}
|
||||
|
||||
if (searchConfig.mode && !['discover', 'scrape'].includes(searchConfig.mode)) {
|
||||
return res.status(400).json({
|
||||
error: "searchConfig.mode must be either 'discover' or 'scrape'"
|
||||
});
|
||||
}
|
||||
|
||||
searchConfig.provider = 'duckduckgo';
|
||||
|
||||
const robotName = name || `Search Robot - ${searchConfig.query}`;
|
||||
const robotId = uuid();
|
||||
const metaId = uuid();
|
||||
|
||||
const robot = await Robot.create({
|
||||
id: robotId,
|
||||
userId: user.id,
|
||||
recording_meta: {
|
||||
name: robotName,
|
||||
id: metaId,
|
||||
createdAt: new Date().toISOString(),
|
||||
updatedAt: new Date().toISOString(),
|
||||
pairs: 1,
|
||||
params: [],
|
||||
type: 'search',
|
||||
},
|
||||
recording: {
|
||||
workflow: [
|
||||
{
|
||||
where: { url: 'about:blank' },
|
||||
what: [
|
||||
{
|
||||
action: 'search',
|
||||
args: [searchConfig],
|
||||
name: 'Search'
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
});
|
||||
|
||||
logger.info(`[SDK] Search robot created: ${metaId} (db: ${robotId}) by user ${user.id}`);
|
||||
|
||||
capture("maxun-oss-robot-created", {
|
||||
userId: user.id.toString(),
|
||||
robotId: metaId,
|
||||
robotName: robotName,
|
||||
robotType: 'search',
|
||||
searchQuery: searchConfig.query,
|
||||
searchProvider: searchConfig.provider || 'duckduckgo',
|
||||
searchLimit: searchConfig.limit || 10,
|
||||
source: 'sdk'
|
||||
});
|
||||
|
||||
return res.status(201).json({
|
||||
data: robot,
|
||||
message: "Search robot created successfully"
|
||||
});
|
||||
|
||||
} catch (error: any) {
|
||||
logger.error("[SDK] Error creating search robot:", error);
|
||||
return res.status(500).json({
|
||||
error: "Failed to create search robot",
|
||||
message: error.message
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
/**
|
||||
* LLM-based extraction - generate workflow from natural language prompt
|
||||
* POST /api/sdk/extract/llm
|
||||
|
||||
@@ -9,7 +9,7 @@ interface RobotMeta {
|
||||
pairs: number;
|
||||
updatedAt: string;
|
||||
params: any[];
|
||||
type?: 'extract' | 'scrape';
|
||||
type?: 'extract' | 'scrape' | 'crawl' | 'search';
|
||||
url?: string;
|
||||
formats?: ('markdown' | 'html' | 'screenshot-visible' | 'screenshot-fullpage')[];
|
||||
isLLM?: boolean;
|
||||
|
||||
@@ -23,6 +23,7 @@ interface RunAttributes {
|
||||
runByUserId?: string;
|
||||
runByScheduleId?: string;
|
||||
runByAPI?: boolean;
|
||||
runBySDK?: boolean;
|
||||
serializableOutput: Record<string, any>;
|
||||
binaryOutput: Record<string, string>;
|
||||
retryCount?: number;
|
||||
|
||||
@@ -132,7 +132,6 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
|
||||
logger.log('info', `Processing run execution job for runId: ${data.runId}, browserId: ${data.browserId}`);
|
||||
|
||||
try {
|
||||
// Find the run
|
||||
const run = await Run.findOne({ where: { runId: data.runId } });
|
||||
if (!run) {
|
||||
logger.log('error', `Run ${data.runId} not found in database`);
|
||||
@@ -193,7 +192,6 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
|
||||
logger.log('info', `Browser ${browserId} found and ready for execution`);
|
||||
|
||||
try {
|
||||
// Find the recording
|
||||
const recording = await Robot.findOne({ where: { 'recording_meta.id': plainRun.robotMetaId }, raw: true });
|
||||
|
||||
if (!recording) {
|
||||
@@ -473,11 +471,12 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
|
||||
interpretationInfo.binaryOutput
|
||||
);
|
||||
|
||||
// Get the already persisted and credit-validated data from the run record
|
||||
const finalRun = await Run.findByPk(run.id);
|
||||
const categorizedOutput = {
|
||||
scrapeSchema: finalRun?.serializableOutput?.scrapeSchema || {},
|
||||
scrapeList: finalRun?.serializableOutput?.scrapeList || {}
|
||||
scrapeList: finalRun?.serializableOutput?.scrapeList || {},
|
||||
crawl: finalRun?.serializableOutput?.crawl || {},
|
||||
search: finalRun?.serializableOutput?.search || {}
|
||||
};
|
||||
|
||||
if (await isRunAborted()) {
|
||||
@@ -489,10 +488,6 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
|
||||
status: 'success',
|
||||
finishedAt: new Date().toLocaleString(),
|
||||
log: interpretationInfo.log.join('\n'),
|
||||
serializableOutput: JSON.parse(JSON.stringify({
|
||||
scrapeSchema: categorizedOutput.scrapeSchema || {},
|
||||
scrapeList: categorizedOutput.scrapeList || {},
|
||||
})),
|
||||
binaryOutput: uploadedBinaryOutput,
|
||||
});
|
||||
|
||||
@@ -572,6 +567,8 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
|
||||
}, {} as Record<string, any[]>)
|
||||
: {},
|
||||
captured_lists: categorizedOutput.scrapeList,
|
||||
crawl_data: categorizedOutput.crawl,
|
||||
search_data: categorizedOutput.search,
|
||||
captured_texts_count: totalSchemaItemsExtracted,
|
||||
captured_lists_count: totalListItemsExtracted,
|
||||
screenshots_count: extractedScreenshotsCount
|
||||
|
||||
@@ -251,21 +251,18 @@ function handleWorkflowActions(workflow: any[], credentials: Credentials) {
|
||||
router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, res) => {
|
||||
try {
|
||||
const { id } = req.params;
|
||||
const { name, limits, credentials, targetUrl, workflow: incomingWorkflow } = req.body;
|
||||
const { name, limits, credentials, targetUrl, workflow: incomingWorkflow } = req.body;
|
||||
|
||||
// Validate input
|
||||
if (!name && !limits && !credentials && !targetUrl) {
|
||||
if (!name && !limits && !credentials && !targetUrl && !incomingWorkflow) {
|
||||
return res.status(400).json({ error: 'Either "name", "limits", "credentials" or "target_url" must be provided.' });
|
||||
}
|
||||
|
||||
// Fetch the robot by ID
|
||||
const robot = await Robot.findOne({ where: { 'recording_meta.id': id } });
|
||||
|
||||
if (!robot) {
|
||||
return res.status(404).json({ error: 'Robot not found.' });
|
||||
}
|
||||
|
||||
// Update fields if provided
|
||||
|
||||
if (name) {
|
||||
robot.set('recording_meta', { ...robot.recording_meta, name });
|
||||
}
|
||||
@@ -274,7 +271,6 @@ router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, r
|
||||
robot.set('recording_meta', { ...robot.recording_meta, url: targetUrl });
|
||||
|
||||
const updatedWorkflow = [...robot.recording.workflow];
|
||||
let foundGoto = false;
|
||||
|
||||
for (let i = updatedWorkflow.length - 1; i >= 0; i--) {
|
||||
const step = updatedWorkflow[i];
|
||||
@@ -289,7 +285,6 @@ router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, r
|
||||
|
||||
robot.set('recording', { ...robot.recording, workflow: updatedWorkflow });
|
||||
robot.changed('recording', true);
|
||||
foundGoto = true;
|
||||
i = -1;
|
||||
break;
|
||||
}
|
||||
@@ -299,10 +294,9 @@ router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, r
|
||||
|
||||
await robot.save();
|
||||
|
||||
// Start with existing workflow or allow client to supply a full workflow replacement
|
||||
let workflow = incomingWorkflow && Array.isArray(incomingWorkflow)
|
||||
? JSON.parse(JSON.stringify(incomingWorkflow))
|
||||
: [...robot.recording.workflow]; // Create a copy of the workflow
|
||||
: [...robot.recording.workflow];
|
||||
|
||||
if (credentials) {
|
||||
workflow = handleWorkflowActions(workflow, credentials);
|
||||
@@ -344,7 +338,7 @@ router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, r
|
||||
where: { 'recording_meta.id': id }
|
||||
});
|
||||
|
||||
const updatedRobot = await Robot.findOne({ where: { 'recording_meta.id': id } });
|
||||
await Robot.findOne({ where: { 'recording_meta.id': id } });
|
||||
|
||||
logger.log('info', `Robot with ID ${id} was updated successfully.`);
|
||||
|
||||
@@ -1323,4 +1317,198 @@ export async function recoverOrphanedRuns() {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* POST endpoint for creating a crawl robot
|
||||
* @route POST /recordings/crawl
|
||||
* @auth requireSignIn - JWT authentication required
|
||||
*/
|
||||
router.post('/recordings/crawl', requireSignIn, async (req: AuthenticatedRequest, res) => {
|
||||
try {
|
||||
const { url, name, crawlConfig } = req.body;
|
||||
|
||||
if (!url || !crawlConfig) {
|
||||
return res.status(400).json({ error: 'URL and crawl configuration are required.' });
|
||||
}
|
||||
|
||||
if (!req.user) {
|
||||
return res.status(401).send({ error: 'Unauthorized' });
|
||||
}
|
||||
|
||||
try {
|
||||
new URL(url);
|
||||
} catch (err) {
|
||||
return res.status(400).json({ error: 'Invalid URL format' });
|
||||
}
|
||||
|
||||
const robotName = name || `Crawl Robot - ${new URL(url).hostname}`;
|
||||
const currentTimestamp = new Date().toLocaleString('en-US');
|
||||
const robotId = uuid();
|
||||
|
||||
const newRobot = await Robot.create({
|
||||
id: uuid(),
|
||||
userId: req.user.id,
|
||||
recording_meta: {
|
||||
name: robotName,
|
||||
id: robotId,
|
||||
createdAt: currentTimestamp,
|
||||
updatedAt: currentTimestamp,
|
||||
pairs: 1,
|
||||
params: [],
|
||||
type: 'crawl',
|
||||
url: url,
|
||||
},
|
||||
recording: {
|
||||
workflow: [
|
||||
{
|
||||
where: { url },
|
||||
what: [
|
||||
{ action: 'flag', args: ['generated'] },
|
||||
{
|
||||
action: 'crawl',
|
||||
args: [crawlConfig],
|
||||
name: 'Crawl'
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
where: { url: 'about:blank' },
|
||||
what: [
|
||||
{
|
||||
action: 'goto',
|
||||
args: [url]
|
||||
},
|
||||
{
|
||||
action: 'waitForLoadState',
|
||||
args: ['networkidle']
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
google_sheet_email: null,
|
||||
google_sheet_name: null,
|
||||
google_sheet_id: null,
|
||||
google_access_token: null,
|
||||
google_refresh_token: null,
|
||||
airtable_base_id: null,
|
||||
airtable_base_name: null,
|
||||
airtable_table_name: null,
|
||||
airtable_table_id: null,
|
||||
airtable_access_token: null,
|
||||
airtable_refresh_token: null,
|
||||
schedule: null,
|
||||
webhooks: null
|
||||
});
|
||||
|
||||
logger.log('info', `Crawl robot created with id: ${newRobot.id}`);
|
||||
capture('maxun-oss-robot-created', {
|
||||
userId: req.user.id.toString(),
|
||||
robotId: robotId,
|
||||
robotName: robotName,
|
||||
url: url,
|
||||
robotType: 'crawl',
|
||||
crawlConfig: crawlConfig
|
||||
});
|
||||
|
||||
return res.status(201).json({
|
||||
message: 'Crawl robot created successfully.',
|
||||
robot: newRobot,
|
||||
});
|
||||
} catch (error) {
|
||||
if (error instanceof Error) {
|
||||
logger.log('error', `Error creating crawl robot: ${error.message}`);
|
||||
return res.status(500).json({ error: error.message });
|
||||
} else {
|
||||
logger.log('error', 'Unknown error creating crawl robot');
|
||||
return res.status(500).json({ error: 'An unknown error occurred.' });
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
/**
|
||||
* POST endpoint for creating a search robot
|
||||
* @route POST /recordings/search
|
||||
* @auth requireSignIn - JWT authentication required
|
||||
*/
|
||||
router.post('/recordings/search', requireSignIn, async (req: AuthenticatedRequest, res) => {
|
||||
try {
|
||||
const { searchConfig, name } = req.body;
|
||||
|
||||
if (!searchConfig || !searchConfig.query) {
|
||||
return res.status(400).json({ error: 'Search configuration with query is required.' });
|
||||
}
|
||||
|
||||
if (!req.user) {
|
||||
return res.status(401).send({ error: 'Unauthorized' });
|
||||
}
|
||||
|
||||
const robotName = name || `Search Robot - ${searchConfig.query.substring(0, 50)}`;
|
||||
const currentTimestamp = new Date().toLocaleString('en-US');
|
||||
const robotId = uuid();
|
||||
|
||||
const newRobot = await Robot.create({
|
||||
id: uuid(),
|
||||
userId: req.user.id,
|
||||
recording_meta: {
|
||||
name: robotName,
|
||||
id: robotId,
|
||||
createdAt: currentTimestamp,
|
||||
updatedAt: currentTimestamp,
|
||||
pairs: 1,
|
||||
params: [],
|
||||
type: 'search',
|
||||
},
|
||||
recording: {
|
||||
workflow: [
|
||||
{
|
||||
where: { url: 'about:blank' },
|
||||
what: [{
|
||||
action: 'search',
|
||||
args: [searchConfig],
|
||||
name: 'Search'
|
||||
}]
|
||||
}
|
||||
]
|
||||
},
|
||||
google_sheet_email: null,
|
||||
google_sheet_name: null,
|
||||
google_sheet_id: null,
|
||||
google_access_token: null,
|
||||
google_refresh_token: null,
|
||||
airtable_base_id: null,
|
||||
airtable_base_name: null,
|
||||
airtable_table_name: null,
|
||||
airtable_table_id: null,
|
||||
airtable_access_token: null,
|
||||
airtable_refresh_token: null,
|
||||
schedule: null,
|
||||
webhooks: null
|
||||
});
|
||||
|
||||
logger.log('info', `Search robot created with id: ${newRobot.id}`);
|
||||
capture('maxun-oss-robot-created', {
|
||||
userId: req.user.id.toString(),
|
||||
robotId: robotId,
|
||||
robotName: robotName,
|
||||
robotType: 'search',
|
||||
searchQuery: searchConfig.query,
|
||||
searchProvider: searchConfig.provider || 'duckduckgo',
|
||||
searchLimit: searchConfig.limit || 10
|
||||
});
|
||||
|
||||
return res.status(201).json({
|
||||
message: 'Search robot created successfully.',
|
||||
robot: newRobot,
|
||||
});
|
||||
} catch (error) {
|
||||
if (error instanceof Error) {
|
||||
logger.log('error', `Error creating search robot: ${error.message}`);
|
||||
return res.status(500).json({ error: error.message });
|
||||
} else {
|
||||
logger.log('error', 'Unknown error creating search robot');
|
||||
return res.status(500).json({ error: 'An unknown error occurred.' });
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
export { processQueuedRuns };
|
||||
@@ -16,7 +16,6 @@ function processWorkflow(workflow: WorkflowFile, checkLimit: boolean = false): W
|
||||
|
||||
processedWorkflow.workflow.forEach((pair) => {
|
||||
pair.what.forEach((action) => {
|
||||
// Handle limit validation for scrapeList action
|
||||
if (action.action === 'scrapeList' && checkLimit && Array.isArray(action.args) && action.args.length > 0) {
|
||||
const scrapeConfig = action.args[0];
|
||||
if (scrapeConfig && typeof scrapeConfig === 'object' && 'limit' in scrapeConfig) {
|
||||
@@ -26,7 +25,6 @@ function processWorkflow(workflow: WorkflowFile, checkLimit: boolean = false): W
|
||||
}
|
||||
}
|
||||
|
||||
// Handle decryption for type and press actions
|
||||
if ((action.action === 'type' || action.action === 'press') && Array.isArray(action.args) && action.args.length > 1) {
|
||||
try {
|
||||
const encryptedValue = action.args[1];
|
||||
@@ -93,10 +91,14 @@ export class WorkflowInterpreter {
|
||||
public serializableDataByType: {
|
||||
scrapeSchema: Record<string, any>;
|
||||
scrapeList: Record<string, any>;
|
||||
crawl: Record<string, any>;
|
||||
search: Record<string, any>;
|
||||
[key: string]: any;
|
||||
} = {
|
||||
scrapeSchema: {},
|
||||
scrapeList: {},
|
||||
crawl: {},
|
||||
search: {},
|
||||
};
|
||||
|
||||
private currentActionName: string | null = null;
|
||||
@@ -282,7 +284,6 @@ export class WorkflowInterpreter {
|
||||
}
|
||||
} else if (this.currentActionType === 'scrapeList') {
|
||||
if (data && Array.isArray(data) && data.length > 0) {
|
||||
// Use the current index for persistence
|
||||
await this.persistDataToDatabase('scrapeList', data, this.currentScrapeListIndex);
|
||||
}
|
||||
|
||||
@@ -293,7 +294,6 @@ export class WorkflowInterpreter {
|
||||
}
|
||||
},
|
||||
binaryCallback: async (data: string, mimetype: string) => {
|
||||
// For editor mode, we don't have the name yet, so use a timestamp-based name
|
||||
const binaryItem = {
|
||||
name: `Screenshot ${Date.now()}`,
|
||||
mimeType: mimetype,
|
||||
@@ -301,7 +301,6 @@ export class WorkflowInterpreter {
|
||||
};
|
||||
this.binaryData.push(binaryItem);
|
||||
|
||||
// Persist binary data to database
|
||||
await this.persistBinaryDataToDatabase(binaryItem);
|
||||
|
||||
this.socket.emit('binaryCallback', {
|
||||
@@ -340,7 +339,6 @@ export class WorkflowInterpreter {
|
||||
|
||||
logger.log('debug', `Interpretation finished`);
|
||||
|
||||
// Flush any remaining data in persistence buffer before completing
|
||||
await this.flushPersistenceBuffer();
|
||||
|
||||
this.interpreter = null;
|
||||
@@ -419,6 +417,8 @@ export class WorkflowInterpreter {
|
||||
this.serializableDataByType = {
|
||||
scrapeSchema: {},
|
||||
scrapeList: {},
|
||||
crawl: {},
|
||||
search: {},
|
||||
};
|
||||
this.binaryData = [];
|
||||
this.currentScrapeListIndex = 0;
|
||||
@@ -598,12 +598,20 @@ export class WorkflowInterpreter {
|
||||
typeKey = "scrapeList";
|
||||
} else if (this.currentActionType === "scrapeSchema") {
|
||||
typeKey = "scrapeSchema";
|
||||
} else if (this.currentActionType === "crawl") {
|
||||
typeKey = "crawl";
|
||||
} else if (this.currentActionType === "search") {
|
||||
typeKey = "search";
|
||||
}
|
||||
|
||||
if (typeKey === "scrapeList" && data.scrapeList) {
|
||||
data = data.scrapeList;
|
||||
} else if (typeKey === "scrapeSchema" && data.scrapeSchema) {
|
||||
data = data.scrapeSchema;
|
||||
} else if (typeKey === "crawl" && data.crawl) {
|
||||
data = data.crawl;
|
||||
} else if (typeKey === "search" && data.search) {
|
||||
data = data.search;
|
||||
}
|
||||
|
||||
let actionName = "";
|
||||
@@ -616,38 +624,65 @@ export class WorkflowInterpreter {
|
||||
actionName = keys[keys.length - 1];
|
||||
data = data[actionName];
|
||||
}
|
||||
} else if (typeKey === "crawl" && data && typeof data === "object" && !Array.isArray(data)) {
|
||||
const keys = Object.keys(data);
|
||||
if (keys.length === 1) {
|
||||
actionName = keys[0];
|
||||
data = data[actionName];
|
||||
} else if (keys.length > 1) {
|
||||
actionName = keys[keys.length - 1];
|
||||
data = data[actionName];
|
||||
}
|
||||
} else if (typeKey === "search" && data && typeof data === "object" && !Array.isArray(data)) {
|
||||
const keys = Object.keys(data);
|
||||
if (keys.length === 1) {
|
||||
actionName = keys[0];
|
||||
data = data[actionName];
|
||||
} else if (keys.length > 1) {
|
||||
actionName = keys[keys.length - 1];
|
||||
data = data[actionName];
|
||||
}
|
||||
}
|
||||
|
||||
if (!actionName) {
|
||||
actionName = this.currentActionName || "";
|
||||
if (typeKey === "scrapeList" && !actionName) {
|
||||
actionName = this.getUniqueActionName(typeKey, "");
|
||||
} else if (typeKey === "crawl" && !actionName) {
|
||||
actionName = this.getUniqueActionName(typeKey, "Crawl Results");
|
||||
} else if (typeKey === "search" && !actionName) {
|
||||
actionName = this.getUniqueActionName(typeKey, "Search Results");
|
||||
}
|
||||
}
|
||||
|
||||
const flattened = Array.isArray(data)
|
||||
? data
|
||||
: (
|
||||
data?.List ??
|
||||
(data && typeof data === "object"
|
||||
? Object.values(data).flat?.() ?? data
|
||||
: [])
|
||||
);
|
||||
let processedData;
|
||||
if (typeKey === "search") {
|
||||
processedData = data;
|
||||
} else {
|
||||
processedData = Array.isArray(data)
|
||||
? data
|
||||
: (
|
||||
data?.List ??
|
||||
(data && typeof data === "object"
|
||||
? Object.values(data).flat?.() ?? data
|
||||
: [])
|
||||
);
|
||||
}
|
||||
|
||||
if (!this.serializableDataByType[typeKey]) {
|
||||
this.serializableDataByType[typeKey] = {};
|
||||
}
|
||||
|
||||
this.serializableDataByType[typeKey][actionName] = flattened;
|
||||
this.serializableDataByType[typeKey][actionName] = processedData;
|
||||
|
||||
await this.persistDataToDatabase(typeKey, {
|
||||
[actionName]: flattened,
|
||||
[actionName]: processedData,
|
||||
});
|
||||
|
||||
this.socket.emit("serializableCallback", {
|
||||
type: typeKey,
|
||||
name: actionName,
|
||||
data: flattened,
|
||||
data: processedData,
|
||||
});
|
||||
} catch (err: any) {
|
||||
logger.log('error', `serializableCallback handler failed: ${err.message}`);
|
||||
@@ -705,7 +740,6 @@ export class WorkflowInterpreter {
|
||||
|
||||
await this.flushPersistenceBuffer();
|
||||
|
||||
// Structure the output to maintain separate data for each action type
|
||||
const result = {
|
||||
log: this.debugMessages,
|
||||
result: status,
|
||||
@@ -801,7 +835,7 @@ export class WorkflowInterpreter {
|
||||
|
||||
const currentSerializableOutput = run.serializableOutput ?
|
||||
JSON.parse(JSON.stringify(run.serializableOutput)) :
|
||||
{ scrapeSchema: [], scrapeList: [] };
|
||||
{ scrapeSchema: {}, scrapeList: {}, crawl: {}, search: {} };
|
||||
|
||||
if (Array.isArray(currentSerializableOutput.scrapeList)) {
|
||||
currentSerializableOutput.scrapeList = {};
|
||||
@@ -809,6 +843,9 @@ export class WorkflowInterpreter {
|
||||
if (Array.isArray(currentSerializableOutput.scrapeSchema)) {
|
||||
currentSerializableOutput.scrapeSchema = {};
|
||||
}
|
||||
if (!currentSerializableOutput.search) {
|
||||
currentSerializableOutput.search = {};
|
||||
}
|
||||
|
||||
let hasUpdates = false;
|
||||
|
||||
@@ -834,6 +871,18 @@ export class WorkflowInterpreter {
|
||||
}
|
||||
mergeLists(currentSerializableOutput.scrapeList, item.data);
|
||||
hasUpdates = true;
|
||||
} else if (item.actionType === 'crawl') {
|
||||
currentSerializableOutput.crawl = {
|
||||
...(currentSerializableOutput.crawl || {}),
|
||||
...item.data
|
||||
};
|
||||
hasUpdates = true;
|
||||
} else if (item.actionType === 'search') {
|
||||
currentSerializableOutput.search = {
|
||||
...(currentSerializableOutput.search || {}),
|
||||
...item.data
|
||||
};
|
||||
hasUpdates = true;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -13,7 +13,11 @@ interface AirtableUpdateTask {
|
||||
|
||||
interface SerializableOutput {
|
||||
scrapeSchema?: Record<string, any[]>;
|
||||
scrapeList?: Record<string, any[]>;
|
||||
scrapeList?: Record<string, any[]>;
|
||||
markdown?: Array<{ content: string }>;
|
||||
html?: Array<{ content: string }>;
|
||||
crawl?: Record<string, any[]>;
|
||||
search?: any;
|
||||
}
|
||||
|
||||
const MAX_RETRIES = 3;
|
||||
@@ -67,6 +71,10 @@ function mergeRelatedData(serializableOutput: SerializableOutput, binaryOutput:
|
||||
const schemaData: Array<{ Group: string; Field: string; Value: any }> = [];
|
||||
const listData: any[] = [];
|
||||
const screenshotData: Array<{ key: string; url: string }> = [];
|
||||
const markdownData: any[] = [];
|
||||
const htmlData: any[] = [];
|
||||
const crawlData: any[] = [];
|
||||
const searchData: any[] = [];
|
||||
|
||||
if (serializableOutput.scrapeSchema) {
|
||||
if (Array.isArray(serializableOutput.scrapeSchema)) {
|
||||
@@ -122,6 +130,66 @@ function mergeRelatedData(serializableOutput: SerializableOutput, binaryOutput:
|
||||
}
|
||||
}
|
||||
|
||||
if (serializableOutput.markdown && Array.isArray(serializableOutput.markdown)) {
|
||||
serializableOutput.markdown.forEach((item, index) => {
|
||||
if (item.content) {
|
||||
markdownData.push({
|
||||
"Index": index + 1,
|
||||
"Type": "Markdown",
|
||||
"Content": item.content
|
||||
});
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
if (serializableOutput.html && Array.isArray(serializableOutput.html)) {
|
||||
serializableOutput.html.forEach((item, index) => {
|
||||
if (item.content) {
|
||||
htmlData.push({
|
||||
"Index": index + 1,
|
||||
"Type": "HTML",
|
||||
"Content": item.content
|
||||
});
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
if (serializableOutput.crawl && typeof serializableOutput.crawl === "object") {
|
||||
for (const [crawlName, crawlArray] of Object.entries(serializableOutput.crawl)) {
|
||||
if (Array.isArray(crawlArray)) {
|
||||
crawlArray.forEach((crawlItem) => {
|
||||
const hasContent = Object.values(crawlItem || {}).some(
|
||||
(value) => value !== null && value !== undefined && value !== ""
|
||||
);
|
||||
if (hasContent) {
|
||||
crawlData.push({ "Crawl Type": crawlName, ...crawlItem });
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (serializableOutput.search) {
|
||||
let results: any[] = [];
|
||||
|
||||
if (serializableOutput.search.results && Array.isArray(serializableOutput.search.results)) {
|
||||
results = serializableOutput.search.results;
|
||||
} else if (Array.isArray(serializableOutput.search)) {
|
||||
results = serializableOutput.search;
|
||||
} else {
|
||||
results = [serializableOutput.search];
|
||||
}
|
||||
|
||||
results.forEach((result) => {
|
||||
const hasContent = Object.values(result || {}).some(
|
||||
(value) => value !== null && value !== undefined && value !== ""
|
||||
);
|
||||
if (hasContent) {
|
||||
searchData.push(result);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// Collect screenshot data (handles both string and object forms safely)
|
||||
// if (binaryOutput && Object.keys(binaryOutput).length > 0) {
|
||||
// Object.entries(binaryOutput).forEach(([key, rawValue]: [string, any]) => {
|
||||
@@ -152,7 +220,15 @@ function mergeRelatedData(serializableOutput: SerializableOutput, binaryOutput:
|
||||
// }
|
||||
|
||||
// --- Merge all types into Airtable rows ---
|
||||
const maxLength = Math.max(schemaData.length, listData.length, screenshotData.length);
|
||||
const maxLength = Math.max(
|
||||
schemaData.length,
|
||||
listData.length,
|
||||
screenshotData.length,
|
||||
markdownData.length,
|
||||
htmlData.length,
|
||||
crawlData.length,
|
||||
searchData.length
|
||||
);
|
||||
|
||||
for (let i = 0; i < maxLength; i++) {
|
||||
const record: Record<string, any> = {};
|
||||
@@ -176,6 +252,38 @@ function mergeRelatedData(serializableOutput: SerializableOutput, binaryOutput:
|
||||
record.Screenshot = screenshotData[i].url;
|
||||
}
|
||||
|
||||
if (i < markdownData.length) {
|
||||
Object.entries(markdownData[i] || {}).forEach(([key, value]) => {
|
||||
if (value !== null && value !== undefined && value !== "") {
|
||||
record[key] = value;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
if (i < htmlData.length) {
|
||||
Object.entries(htmlData[i] || {}).forEach(([key, value]) => {
|
||||
if (value !== null && value !== undefined && value !== "") {
|
||||
record[key] = value;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
if (i < crawlData.length) {
|
||||
Object.entries(crawlData[i] || {}).forEach(([key, value]) => {
|
||||
if (value !== null && value !== undefined && value !== "") {
|
||||
record[key] = value;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
if (i < searchData.length) {
|
||||
Object.entries(searchData[i] || {}).forEach(([key, value]) => {
|
||||
if (value !== null && value !== undefined && value !== "") {
|
||||
record[key] = value;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
if (Object.keys(record).length > 0) {
|
||||
allRecords.push(record);
|
||||
}
|
||||
@@ -194,6 +302,18 @@ function mergeRelatedData(serializableOutput: SerializableOutput, binaryOutput:
|
||||
Screenshot: screenshotData[i].url,
|
||||
});
|
||||
}
|
||||
for (let i = maxLength; i < markdownData.length; i++) {
|
||||
allRecords.push(markdownData[i]);
|
||||
}
|
||||
for (let i = maxLength; i < htmlData.length; i++) {
|
||||
allRecords.push(htmlData[i]);
|
||||
}
|
||||
for (let i = maxLength; i < crawlData.length; i++) {
|
||||
allRecords.push(crawlData[i]);
|
||||
}
|
||||
for (let i = maxLength; i < searchData.length; i++) {
|
||||
allRecords.push(searchData[i]);
|
||||
}
|
||||
|
||||
return allRecords;
|
||||
}
|
||||
|
||||
@@ -13,6 +13,10 @@ interface GoogleSheetUpdateTask {
|
||||
interface SerializableOutput {
|
||||
scrapeSchema?: Record<string, any[]>;
|
||||
scrapeList?: Record<string, any[]>;
|
||||
markdown?: Array<{ content: string }>;
|
||||
html?: Array<{ content: string }>;
|
||||
crawl?: Record<string, any[]>;
|
||||
search?: any;
|
||||
}
|
||||
|
||||
|
||||
@@ -95,6 +99,72 @@ export async function updateGoogleSheet(robotId: string, runId: string) {
|
||||
}
|
||||
}
|
||||
|
||||
if (serializableOutput.markdown && Array.isArray(serializableOutput.markdown) && serializableOutput.markdown.length > 0) {
|
||||
const markdownData = serializableOutput.markdown.map((item, index) => ({
|
||||
"Index": index + 1,
|
||||
"Content": item.content || ""
|
||||
}));
|
||||
|
||||
await processOutputType(
|
||||
robotId,
|
||||
spreadsheetId,
|
||||
'Markdown',
|
||||
markdownData,
|
||||
plainRobot
|
||||
);
|
||||
}
|
||||
|
||||
if (serializableOutput.html && Array.isArray(serializableOutput.html) && serializableOutput.html.length > 0) {
|
||||
const htmlData = serializableOutput.html.map((item, index) => ({
|
||||
"Index": index + 1,
|
||||
"Content": item.content || ""
|
||||
}));
|
||||
|
||||
await processOutputType(
|
||||
robotId,
|
||||
spreadsheetId,
|
||||
'HTML',
|
||||
htmlData,
|
||||
plainRobot
|
||||
);
|
||||
}
|
||||
|
||||
if (serializableOutput.crawl && typeof serializableOutput.crawl === "object") {
|
||||
for (const [crawlName, crawlArray] of Object.entries(serializableOutput.crawl)) {
|
||||
if (!Array.isArray(crawlArray) || crawlArray.length === 0) continue;
|
||||
|
||||
await processOutputType(
|
||||
robotId,
|
||||
spreadsheetId,
|
||||
`Crawl - ${crawlName}`,
|
||||
crawlArray,
|
||||
plainRobot
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
if (serializableOutput.search) {
|
||||
let searchData: any[] = [];
|
||||
|
||||
if (serializableOutput.search.results && Array.isArray(serializableOutput.search.results)) {
|
||||
searchData = serializableOutput.search.results;
|
||||
} else if (Array.isArray(serializableOutput.search)) {
|
||||
searchData = serializableOutput.search;
|
||||
} else {
|
||||
searchData = [serializableOutput.search];
|
||||
}
|
||||
|
||||
if (searchData.length > 0) {
|
||||
await processOutputType(
|
||||
robotId,
|
||||
spreadsheetId,
|
||||
'Search Results',
|
||||
searchData,
|
||||
plainRobot
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if (plainRun.binaryOutput && Object.keys(plainRun.binaryOutput).length > 0) {
|
||||
|
||||
@@ -484,6 +484,8 @@ async function executeRun(id: string, userId: string) {
|
||||
const categorizedOutput = {
|
||||
scrapeSchema: finalRun?.serializableOutput?.scrapeSchema || {},
|
||||
scrapeList: finalRun?.serializableOutput?.scrapeList || {},
|
||||
crawl: finalRun?.serializableOutput?.crawl || {},
|
||||
search: finalRun?.serializableOutput?.search || {}
|
||||
};
|
||||
|
||||
await destroyRemoteBrowser(plainRun.browserId, userId);
|
||||
@@ -570,6 +572,8 @@ async function executeRun(id: string, userId: string) {
|
||||
}, {} as Record<string, any[]>)
|
||||
: {},
|
||||
captured_lists: categorizedOutput.scrapeList,
|
||||
crawl_data: categorizedOutput.crawl,
|
||||
search_data: categorizedOutput.search,
|
||||
captured_texts_count: totalSchemaItemsExtracted,
|
||||
captured_lists_count: totalListItemsExtracted,
|
||||
screenshots_count: extractedScreenshotsCount
|
||||
|
||||
@@ -335,4 +335,81 @@ export const deleteSchedule = async (id: string): Promise<boolean> => {
|
||||
console.log(error);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
export const createCrawlRobot = async (
|
||||
url: string,
|
||||
name: string,
|
||||
crawlConfig: {
|
||||
mode: 'domain' | 'subdomain' | 'path';
|
||||
limit: number;
|
||||
maxDepth: number;
|
||||
includePaths: string[];
|
||||
excludePaths: string[];
|
||||
useSitemap: boolean;
|
||||
followLinks: boolean;
|
||||
respectRobots: boolean;
|
||||
}
|
||||
): Promise<any> => {
|
||||
try {
|
||||
const response = await axios.post(
|
||||
`${apiUrl}/storage/recordings/crawl`,
|
||||
{
|
||||
url,
|
||||
name,
|
||||
crawlConfig,
|
||||
},
|
||||
{
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
withCredentials: true,
|
||||
}
|
||||
);
|
||||
|
||||
if (response.status === 201) {
|
||||
return response.data;
|
||||
} else {
|
||||
throw new Error('Failed to create crawl robot');
|
||||
}
|
||||
} catch (error: any) {
|
||||
console.error('Error creating crawl robot:', error);
|
||||
return null;
|
||||
}
|
||||
};
|
||||
|
||||
export const createSearchRobot = async (
|
||||
name: string,
|
||||
searchConfig: {
|
||||
query: string;
|
||||
limit: number;
|
||||
provider: 'google' | 'bing' | 'duckduckgo';
|
||||
filters?: {
|
||||
timeRange?: 'day' | 'week' | 'month' | 'year';
|
||||
location?: string;
|
||||
lang?: string;
|
||||
};
|
||||
mode: 'discover' | 'scrape';
|
||||
}
|
||||
): Promise<any> => {
|
||||
try {
|
||||
const response = await axios.post(
|
||||
`${apiUrl}/storage/recordings/search`,
|
||||
{
|
||||
name,
|
||||
searchConfig,
|
||||
},
|
||||
{
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
withCredentials: true,
|
||||
}
|
||||
);
|
||||
|
||||
if (response.status === 201) {
|
||||
return response.data;
|
||||
} else {
|
||||
throw new Error('Failed to create search robot');
|
||||
}
|
||||
} catch (error: any) {
|
||||
console.error('Error creating search robot:', error);
|
||||
return null;
|
||||
}
|
||||
};
|
||||
@@ -154,7 +154,7 @@ export const RobotConfigPage: React.FC<RobotConfigPageProps> = ({
|
||||
)}
|
||||
|
||||
<Box sx={{ display: 'flex', gap: 2 }}>
|
||||
/* {showCancelButton && (
|
||||
{/* {showCancelButton && (
|
||||
<Button
|
||||
variant="outlined"
|
||||
onClick={handleBack}
|
||||
@@ -164,7 +164,7 @@ export const RobotConfigPage: React.FC<RobotConfigPageProps> = ({
|
||||
}} >
|
||||
{cancelButtonText || t("buttons.cancel")}
|
||||
</Button>
|
||||
)} */
|
||||
)} */}
|
||||
{showSaveButton && onSave && (
|
||||
<Button
|
||||
variant="contained"
|
||||
|
||||
@@ -17,12 +17,14 @@ import {
|
||||
FormControl,
|
||||
Select,
|
||||
MenuItem,
|
||||
InputLabel
|
||||
InputLabel,
|
||||
Collapse,
|
||||
FormControlLabel
|
||||
} from '@mui/material';
|
||||
import { ArrowBack, AutoAwesome, HighlightAlt } from '@mui/icons-material';
|
||||
import { useGlobalInfoStore, useCacheInvalidation } from '../../../context/globalInfo';
|
||||
import { canCreateBrowserInState, getActiveBrowserId, stopRecording } from '../../../api/recording';
|
||||
import { createScrapeRobot, createLLMRobot, createAndRunRecording } from "../../../api/storage";
|
||||
import { createScrapeRobot, createLLMRobot, createAndRunRecording, createCrawlRobot, createSearchRobot } from "../../../api/storage";
|
||||
import { AuthContext } from '../../../context/auth';
|
||||
import { GenericModal } from '../../ui/GenericModal';
|
||||
|
||||
@@ -72,6 +74,25 @@ const RobotCreate: React.FC = () => {
|
||||
const [llmBaseUrl, setLlmBaseUrl] = useState('');
|
||||
const [aiRobotName, setAiRobotName] = useState('');
|
||||
|
||||
const [crawlRobotName, setCrawlRobotName] = useState('');
|
||||
const [crawlUrl, setCrawlUrl] = useState('');
|
||||
const [crawlMode, setCrawlMode] = useState<'domain' | 'subdomain' | 'path'>('domain');
|
||||
const [crawlLimit, setCrawlLimit] = useState(50);
|
||||
const [crawlMaxDepth, setCrawlMaxDepth] = useState(3);
|
||||
const [crawlIncludePaths, setCrawlIncludePaths] = useState<string>('');
|
||||
const [crawlExcludePaths, setCrawlExcludePaths] = useState<string>('');
|
||||
const [crawlUseSitemap, setCrawlUseSitemap] = useState(true);
|
||||
const [crawlFollowLinks, setCrawlFollowLinks] = useState(true);
|
||||
const [crawlRespectRobots, setCrawlRespectRobots] = useState(true);
|
||||
const [showCrawlAdvanced, setShowCrawlAdvanced] = useState(false);
|
||||
|
||||
const [searchRobotName, setSearchRobotName] = useState('');
|
||||
const [searchQuery, setSearchQuery] = useState('');
|
||||
const [searchLimit, setSearchLimit] = useState(10);
|
||||
const [searchProvider] = useState<'duckduckgo'>('duckduckgo');
|
||||
const [searchMode, setSearchMode] = useState<'discover' | 'scrape'>('discover');
|
||||
const [searchTimeRange, setSearchTimeRange] = useState<'day' | 'week' | 'month' | 'year' | ''>('');
|
||||
|
||||
const { state } = React.useContext(AuthContext);
|
||||
const { user } = state;
|
||||
const { addOptimisticRobot, removeOptimisticRobot, invalidateRecordings, invalidateRuns, addOptimisticRun } = useCacheInvalidation();
|
||||
@@ -155,6 +176,76 @@ const RobotCreate: React.FC = () => {
|
||||
navigate('/robots');
|
||||
};
|
||||
|
||||
const handleCreateCrawlRobot = async () => {
|
||||
if (!crawlUrl.trim()) {
|
||||
notify('error', 'Please enter a valid URL');
|
||||
return;
|
||||
}
|
||||
if (!crawlRobotName.trim()) {
|
||||
notify('error', 'Please enter a robot name');
|
||||
return;
|
||||
}
|
||||
|
||||
setIsLoading(true);
|
||||
const result = await createCrawlRobot(
|
||||
crawlUrl,
|
||||
crawlRobotName,
|
||||
{
|
||||
mode: crawlMode,
|
||||
limit: crawlLimit,
|
||||
maxDepth: crawlMaxDepth,
|
||||
includePaths: crawlIncludePaths ? crawlIncludePaths.split(',').map(p => p.trim()) : [],
|
||||
excludePaths: crawlExcludePaths ? crawlExcludePaths.split(',').map(p => p.trim()) : [],
|
||||
useSitemap: crawlUseSitemap,
|
||||
followLinks: crawlFollowLinks,
|
||||
respectRobots: crawlRespectRobots
|
||||
}
|
||||
);
|
||||
setIsLoading(false);
|
||||
|
||||
if (result) {
|
||||
invalidateRecordings();
|
||||
notify('success', `${crawlRobotName} created successfully!`);
|
||||
navigate('/robots');
|
||||
} else {
|
||||
notify('error', 'Failed to create crawl robot');
|
||||
}
|
||||
};
|
||||
|
||||
const handleCreateSearchRobot = async () => {
|
||||
if (!searchQuery.trim()) {
|
||||
notify('error', 'Please enter a search query');
|
||||
return;
|
||||
}
|
||||
if (!searchRobotName.trim()) {
|
||||
notify('error', 'Please enter a robot name');
|
||||
return;
|
||||
}
|
||||
|
||||
setIsLoading(true);
|
||||
const result = await createSearchRobot(
|
||||
searchRobotName,
|
||||
{
|
||||
query: searchQuery,
|
||||
limit: searchLimit,
|
||||
provider: searchProvider,
|
||||
filters: {
|
||||
timeRange: searchTimeRange ? searchTimeRange as 'day' | 'week' | 'month' | 'year' : undefined
|
||||
},
|
||||
mode: searchMode
|
||||
}
|
||||
);
|
||||
setIsLoading(false);
|
||||
|
||||
if (result) {
|
||||
invalidateRecordings();
|
||||
notify('success', `${searchRobotName} created successfully!`);
|
||||
navigate('/robots');
|
||||
} else {
|
||||
notify('error', 'Failed to create search robot');
|
||||
}
|
||||
};
|
||||
|
||||
return (
|
||||
<Container maxWidth="md" sx={{ py: 4 }}>
|
||||
<Box>
|
||||
@@ -210,6 +301,8 @@ const RobotCreate: React.FC = () => {
|
||||
>
|
||||
<Tab label="Extract" id="extract-robot" aria-controls="extract-robot" />
|
||||
<Tab label="Scrape" id="scrape-robot" aria-controls="scrape-robot" />
|
||||
<Tab label="Crawl" id="crawl-robot" aria-controls="crawl-robot" />
|
||||
<Tab label="Search" id="search-robot" aria-controls="search-robot" />
|
||||
</Tabs>
|
||||
</Box>
|
||||
|
||||
@@ -729,6 +822,262 @@ const RobotCreate: React.FC = () => {
|
||||
</Box>
|
||||
</Card>
|
||||
</TabPanel>
|
||||
|
||||
<TabPanel value={tabValue} index={2}>
|
||||
<Card sx={{ mb: 4, p: 4, textAlign: 'center' }}>
|
||||
<Box display="flex" flexDirection="column" alignItems="center">
|
||||
<img
|
||||
src="https://ik.imagekit.io/ys1blv5kv/maxunlogo.png"
|
||||
width={73}
|
||||
height={65}
|
||||
style={{
|
||||
borderRadius: '5px',
|
||||
marginBottom: '30px'
|
||||
}}
|
||||
alt="Maxun Logo"
|
||||
/>
|
||||
|
||||
<Typography variant="body2" color="text.secondary" mb={3}>
|
||||
Crawl entire websites and gather data from multiple pages automatically.
|
||||
</Typography>
|
||||
|
||||
<Box sx={{ width: '100%', maxWidth: 700, mb: 2 }}>
|
||||
<TextField
|
||||
label="Robot Name"
|
||||
placeholder="Example: YC Companies Crawler"
|
||||
fullWidth
|
||||
value={crawlRobotName}
|
||||
onChange={(e) => setCrawlRobotName(e.target.value)}
|
||||
sx={{ mb: 2 }}
|
||||
/>
|
||||
<TextField
|
||||
label="Starting URL"
|
||||
placeholder="https://www.ycombinator.com/companies"
|
||||
fullWidth
|
||||
value={crawlUrl}
|
||||
onChange={(e) => setCrawlUrl(e.target.value)}
|
||||
sx={{ mb: 2 }}
|
||||
/>
|
||||
|
||||
<TextField
|
||||
label="Max Pages to Crawl"
|
||||
type="number"
|
||||
fullWidth
|
||||
value={crawlLimit}
|
||||
onChange={(e) => setCrawlLimit(parseInt(e.target.value) || 10)}
|
||||
sx={{ mb: 2 }}
|
||||
/>
|
||||
|
||||
<Box sx={{ width: '100%', display: 'flex', justifyContent: 'flex-start', mb: 2 }}>
|
||||
<Button
|
||||
onClick={() => setShowCrawlAdvanced(!showCrawlAdvanced)}
|
||||
sx={{
|
||||
textTransform: 'none',
|
||||
color: '#ff00c3',
|
||||
}}
|
||||
>
|
||||
{showCrawlAdvanced ? 'Hide Advanced Options' : 'Advanced Options'}
|
||||
</Button>
|
||||
</Box>
|
||||
|
||||
<Collapse in={showCrawlAdvanced}>
|
||||
<Box sx={{ mb: 2 }}>
|
||||
<FormControl fullWidth sx={{ mb: 2 }}>
|
||||
<InputLabel>Crawl Scope</InputLabel>
|
||||
<Select
|
||||
value={crawlMode}
|
||||
label="Crawl Scope"
|
||||
onChange={(e) => setCrawlMode(e.target.value as any)}
|
||||
>
|
||||
<MenuItem value="domain">Same Domain Only</MenuItem>
|
||||
<MenuItem value="subdomain">Include Subdomains</MenuItem>
|
||||
<MenuItem value="path">Specific Path Only</MenuItem>
|
||||
</Select>
|
||||
</FormControl>
|
||||
|
||||
<TextField
|
||||
label="Max Depth"
|
||||
type="number"
|
||||
fullWidth
|
||||
value={crawlMaxDepth}
|
||||
onChange={(e) => setCrawlMaxDepth(parseInt(e.target.value) || 3)}
|
||||
sx={{ mb: 2 }}
|
||||
helperText="How many links deep to follow (default: 3)"
|
||||
FormHelperTextProps={{ sx: { ml: 0 } }}
|
||||
/>
|
||||
|
||||
<TextField
|
||||
label="Include Paths"
|
||||
placeholder="Example: /products, /blog"
|
||||
fullWidth
|
||||
value={crawlIncludePaths}
|
||||
onChange={(e) => setCrawlIncludePaths(e.target.value)}
|
||||
sx={{ mb: 2 }}
|
||||
helperText="Only crawl URLs matching these paths (comma-separated)"
|
||||
FormHelperTextProps={{ sx: { ml: 0 } }}
|
||||
/>
|
||||
|
||||
<TextField
|
||||
label="Exclude Paths"
|
||||
placeholder="Example: /admin, /login"
|
||||
fullWidth
|
||||
value={crawlExcludePaths}
|
||||
onChange={(e) => setCrawlExcludePaths(e.target.value)}
|
||||
sx={{ mb: 2 }}
|
||||
helperText="Skip URLs matching these paths (comma-separated)"
|
||||
FormHelperTextProps={{ sx: { ml: 0 } }}
|
||||
/>
|
||||
|
||||
<Box sx={{ display: 'flex', flexDirection: 'column', gap: 1 }}>
|
||||
<FormControlLabel
|
||||
control={
|
||||
<Checkbox
|
||||
checked={crawlUseSitemap}
|
||||
onChange={(e) => setCrawlUseSitemap(e.target.checked)}
|
||||
/>
|
||||
}
|
||||
label="Use sitemap.xml for URL discovery"
|
||||
/>
|
||||
<FormControlLabel
|
||||
control={
|
||||
<Checkbox
|
||||
checked={crawlFollowLinks}
|
||||
onChange={(e) => setCrawlFollowLinks(e.target.checked)}
|
||||
/>
|
||||
}
|
||||
label="Follow links on pages"
|
||||
/>
|
||||
<FormControlLabel
|
||||
control={
|
||||
<Checkbox
|
||||
checked={crawlRespectRobots}
|
||||
onChange={(e) => setCrawlRespectRobots(e.target.checked)}
|
||||
/>
|
||||
}
|
||||
label="Respect robots.txt"
|
||||
/>
|
||||
</Box>
|
||||
</Box>
|
||||
</Collapse>
|
||||
</Box>
|
||||
|
||||
<Button
|
||||
variant="contained"
|
||||
fullWidth
|
||||
onClick={handleCreateCrawlRobot}
|
||||
disabled={!crawlUrl.trim() || !crawlRobotName.trim() || isLoading}
|
||||
sx={{
|
||||
bgcolor: '#ff00c3',
|
||||
py: 1.4,
|
||||
fontSize: '1rem',
|
||||
textTransform: 'none',
|
||||
maxWidth: 700,
|
||||
borderRadius: 2
|
||||
}}
|
||||
startIcon={isLoading ? <CircularProgress size={20} color="inherit" /> : null}
|
||||
>
|
||||
{isLoading ? 'Creating...' : 'Create Robot'}
|
||||
</Button>
|
||||
</Box>
|
||||
</Card>
|
||||
</TabPanel>
|
||||
|
||||
<TabPanel value={tabValue} index={3}>
|
||||
<Card sx={{ mb: 4, p: 4, textAlign: 'center' }}>
|
||||
<Box display="flex" flexDirection="column" alignItems="center">
|
||||
<img
|
||||
src="https://ik.imagekit.io/ys1blv5kv/maxunlogo.png"
|
||||
width={73}
|
||||
height={65}
|
||||
style={{
|
||||
borderRadius: '5px',
|
||||
marginBottom: '30px'
|
||||
}}
|
||||
alt="Maxun Logo"
|
||||
/>
|
||||
|
||||
<Typography variant="body2" color="text.secondary" mb={3}>
|
||||
Search the web and gather data from relevant results.
|
||||
</Typography>
|
||||
|
||||
<Box sx={{ width: '100%', maxWidth: 700, mb: 2 }}>
|
||||
<TextField
|
||||
label="Robot Name"
|
||||
placeholder="Example: AI News Monitor"
|
||||
fullWidth
|
||||
value={searchRobotName}
|
||||
onChange={(e) => setSearchRobotName(e.target.value)}
|
||||
sx={{ mb: 2 }}
|
||||
/>
|
||||
|
||||
<TextField
|
||||
label="Search Query"
|
||||
placeholder="Example: latest AI breakthroughs 2025"
|
||||
fullWidth
|
||||
value={searchQuery}
|
||||
onChange={(e) => setSearchQuery(e.target.value)}
|
||||
sx={{ mb: 2 }}
|
||||
/>
|
||||
|
||||
<TextField
|
||||
label="Number of Results"
|
||||
type="number"
|
||||
fullWidth
|
||||
value={searchLimit}
|
||||
onChange={(e) => setSearchLimit(parseInt(e.target.value) || 10)}
|
||||
sx={{ mb: 2 }}
|
||||
/>
|
||||
|
||||
<Box sx={{ display: 'flex', gap: 2 }}>
|
||||
<FormControl fullWidth sx={{ mb: 2 }}>
|
||||
<InputLabel>Mode</InputLabel>
|
||||
<Select
|
||||
value={searchMode}
|
||||
label="Mode"
|
||||
onChange={(e) => setSearchMode(e.target.value as any)}
|
||||
>
|
||||
<MenuItem value="discover">Discover URLs Only</MenuItem>
|
||||
<MenuItem value="scrape">Extract Data from Results</MenuItem>
|
||||
</Select>
|
||||
</FormControl>
|
||||
|
||||
<FormControl fullWidth sx={{ mb: 2 }}>
|
||||
<InputLabel>Time Range</InputLabel>
|
||||
<Select
|
||||
value={searchTimeRange}
|
||||
label="Time Range"
|
||||
onChange={(e) => setSearchTimeRange(e.target.value as 'day' | 'week' | 'month' | 'year' | '')}
|
||||
>
|
||||
<MenuItem value="">No Filter</MenuItem>
|
||||
<MenuItem value="day">Past 24 Hours</MenuItem>
|
||||
<MenuItem value="week">Past Week</MenuItem>
|
||||
<MenuItem value="month">Past Month</MenuItem>
|
||||
<MenuItem value="year">Past Year</MenuItem>
|
||||
</Select>
|
||||
</FormControl>
|
||||
</Box>
|
||||
</Box>
|
||||
|
||||
<Button
|
||||
variant="contained"
|
||||
fullWidth
|
||||
onClick={handleCreateSearchRobot}
|
||||
disabled={!searchQuery.trim() || !searchRobotName.trim() || isLoading}
|
||||
sx={{
|
||||
bgcolor: '#ff00c3',
|
||||
py: 1.4,
|
||||
fontSize: '1rem',
|
||||
textTransform: 'none',
|
||||
maxWidth: 700,
|
||||
borderRadius: 2
|
||||
}}
|
||||
startIcon={isLoading ? <CircularProgress size={20} color="inherit" /> : null}
|
||||
>
|
||||
{isLoading ? 'Creating...' : 'Create Robot'}
|
||||
</Button>
|
||||
</Box>
|
||||
</Card>
|
||||
</TabPanel>
|
||||
</Box>
|
||||
|
||||
|
||||
|
||||
@@ -24,7 +24,7 @@ interface RobotMeta {
|
||||
pairs: number;
|
||||
updatedAt: string;
|
||||
params: any[];
|
||||
type?: 'extract' | 'scrape';
|
||||
type?: 'extract' | 'scrape' | 'crawl' | 'search';
|
||||
url?: string;
|
||||
formats?: ('markdown' | 'html' | 'screenshot-visible' | 'screenshot-fullpage')[];
|
||||
isLLM?: boolean;
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import React, { useState, useEffect } from "react";
|
||||
import { useState, useEffect } from "react";
|
||||
import { useTranslation } from "react-i18next";
|
||||
import {
|
||||
TextField,
|
||||
@@ -7,7 +7,13 @@ import {
|
||||
Button,
|
||||
IconButton,
|
||||
InputAdornment,
|
||||
Divider,
|
||||
FormControl,
|
||||
InputLabel,
|
||||
Select,
|
||||
MenuItem,
|
||||
FormControlLabel,
|
||||
Checkbox,
|
||||
Collapse
|
||||
} from "@mui/material";
|
||||
import { Visibility, VisibilityOff } from "@mui/icons-material";
|
||||
import { useGlobalInfoStore } from "../../../context/globalInfo";
|
||||
@@ -24,7 +30,7 @@ interface RobotMeta {
|
||||
pairs: number;
|
||||
updatedAt: string;
|
||||
params: any[];
|
||||
type?: 'extract' | 'scrape';
|
||||
type?: 'extract' | 'scrape' | 'crawl' | 'search';
|
||||
url?: string;
|
||||
formats?: ('markdown' | 'html' | 'screenshot-visible' | 'screenshot-fullpage')[];
|
||||
isLLM?: boolean;
|
||||
@@ -97,6 +103,25 @@ interface ScrapeListLimit {
|
||||
currentLimit: number;
|
||||
}
|
||||
|
||||
/**
 * Crawl action configuration, stored as the first element of the workflow
 * `crawl` action's `args` array (see extractCrawlConfig / handleSave).
 * All fields are optional; the edit UI falls back to defaults
 * (limit 10, maxDepth 3, mode 'domain', all booleans true).
 */
interface CrawlConfig {
  // Crawl scope — the UI offers 'domain' | 'subdomain' | 'path';
  // typed as plain string here. TODO confirm backend accepts only these.
  mode?: string;
  // Maximum number of pages to crawl.
  limit?: number;
  // How many links deep to follow from the start URL.
  maxDepth?: number;
  // Discover URLs via sitemap.xml.
  useSitemap?: boolean;
  // Follow links found on crawled pages.
  followLinks?: boolean;
  // Skip URLs matching these path prefixes.
  excludePaths?: string[];
  // Only crawl URLs matching these path prefixes.
  includePaths?: string[];
  // Honor robots.txt directives.
  respectRobots?: boolean;
}
|
||||
|
||||
/**
 * Search action configuration, stored as the first element of the workflow
 * `search` action's `args` array (see extractSearchConfig / handleSave).
 */
interface SearchConfig {
  // 'discover' returns result URLs only; 'scrape' extracts data from each result.
  mode?: 'discover' | 'scrape';
  // Number of search results to fetch.
  limit?: number;
  // The search query string.
  query?: string;
  // Provider-specific filters, e.g. { timeRange: 'day' | 'week' | 'month' | 'year' }.
  filters?: Record<string, any>;
  // Search provider; handleSave forces this to 'duckduckgo'.
  provider?: string;
}
|
||||
|
||||
export const RobotEditPage = ({ handleStart }: RobotSettingsProps) => {
|
||||
const { t } = useTranslation();
|
||||
const navigate = useNavigate();
|
||||
@@ -115,6 +140,9 @@ export const RobotEditPage = ({ handleStart }: RobotSettingsProps) => {
|
||||
[]
|
||||
);
|
||||
const [isLoading, setIsLoading] = useState(false);
|
||||
const [crawlConfig, setCrawlConfig] = useState<CrawlConfig>({});
|
||||
const [searchConfig, setSearchConfig] = useState<SearchConfig>({});
|
||||
const [showCrawlAdvanced, setShowCrawlAdvanced] = useState(false);
|
||||
|
||||
const isEmailPattern = (value: string): boolean => {
|
||||
return value.includes("@");
|
||||
@@ -163,6 +191,8 @@ export const RobotEditPage = ({ handleStart }: RobotSettingsProps) => {
|
||||
setCredentialGroups(groupCredentialsByType(extractedCredentials));
|
||||
|
||||
findScrapeListLimits(robot.recording.workflow);
|
||||
extractCrawlConfig(robot.recording.workflow);
|
||||
extractSearchConfig(robot.recording.workflow);
|
||||
}
|
||||
}, [robot]);
|
||||
|
||||
@@ -195,6 +225,36 @@ export const RobotEditPage = ({ handleStart }: RobotSettingsProps) => {
|
||||
setScrapeListLimits(limits);
|
||||
};
|
||||
|
||||
const extractCrawlConfig = (workflow: WhereWhatPair[]) => {
|
||||
workflow.forEach((pair) => {
|
||||
if (!pair.what) return;
|
||||
|
||||
pair.what.forEach((action: any) => {
|
||||
if (action.action === "crawl" && action.args && action.args.length > 0) {
|
||||
const config = action.args[0];
|
||||
if (config && typeof config === "object") {
|
||||
setCrawlConfig(config as CrawlConfig);
|
||||
}
|
||||
}
|
||||
});
|
||||
});
|
||||
};
|
||||
|
||||
const extractSearchConfig = (workflow: WhereWhatPair[]) => {
|
||||
workflow.forEach((pair) => {
|
||||
if (!pair.what) return;
|
||||
|
||||
pair.what.forEach((action: any) => {
|
||||
if (action.action === "search" && action.args && action.args.length > 0) {
|
||||
const config = action.args[0];
|
||||
if (config && typeof config === "object") {
|
||||
setSearchConfig(config as SearchConfig);
|
||||
}
|
||||
}
|
||||
});
|
||||
});
|
||||
};
|
||||
|
||||
function extractInitialCredentials(workflow: any[]): Credentials {
|
||||
const credentials: Credentials = {};
|
||||
|
||||
@@ -475,19 +535,17 @@ export const RobotEditPage = ({ handleStart }: RobotSettingsProps) => {
|
||||
<>
|
||||
{renderCredentialFields(
|
||||
credentialGroups.usernames,
|
||||
t("Username"),
|
||||
"text"
|
||||
t("Username")
|
||||
)}
|
||||
|
||||
{renderCredentialFields(credentialGroups.emails, t("Email"), "text")}
|
||||
{renderCredentialFields(credentialGroups.emails, t("Email"))}
|
||||
|
||||
{renderCredentialFields(
|
||||
credentialGroups.passwords,
|
||||
t("Password"),
|
||||
"password"
|
||||
t("Password")
|
||||
)}
|
||||
|
||||
{renderCredentialFields(credentialGroups.others, t("Other"), "text")}
|
||||
{renderCredentialFields(credentialGroups.others, t("Other"))}
|
||||
</>
|
||||
);
|
||||
};
|
||||
@@ -502,7 +560,6 @@ export const RobotEditPage = ({ handleStart }: RobotSettingsProps) => {
|
||||
</Typography>
|
||||
|
||||
{scrapeListLimits.map((limitInfo, index) => {
|
||||
// Get the corresponding scrapeList action to extract its name
|
||||
const scrapeListAction = robot?.recording?.workflow?.[limitInfo.pairIndex]?.what?.[limitInfo.actionIndex];
|
||||
const actionName =
|
||||
scrapeListAction?.name ||
|
||||
@@ -542,7 +599,6 @@ export const RobotEditPage = ({ handleStart }: RobotSettingsProps) => {
|
||||
const screenshotInputs: JSX.Element[] = [];
|
||||
const listInputs: JSX.Element[] = [];
|
||||
|
||||
let textCount = 0;
|
||||
let screenshotCount = 0;
|
||||
let listCount = 0;
|
||||
|
||||
@@ -683,7 +739,6 @@ export const RobotEditPage = ({ handleStart }: RobotSettingsProps) => {
|
||||
const renderCredentialFields = (
|
||||
selectors: string[],
|
||||
headerText: string,
|
||||
defaultType: "text" | "password" = "text"
|
||||
) => {
|
||||
if (selectors.length === 0) return null;
|
||||
|
||||
@@ -737,6 +792,193 @@ export const RobotEditPage = ({ handleStart }: RobotSettingsProps) => {
|
||||
return url;
|
||||
};
|
||||
|
||||
/**
 * Render the editable crawl configuration form (page limit plus a
 * collapsible "Advanced Options" section: scope, depth, include/exclude
 * paths, sitemap/link-following/robots toggles).
 * Returns null unless the robot is a crawl robot. All inputs write back
 * into the `crawlConfig` state, which handleSave persists into the
 * workflow's crawl action args.
 */
const renderCrawlConfigFields = () => {
  if (robot?.recording_meta.type !== 'crawl') return null;

  return (
    <>
      <TextField
        label="Max Pages to Crawl"
        type="number"
        fullWidth
        value={crawlConfig.limit || 10}
        onChange={(e) => {
          // Ignore NaN (cleared field) and values below 1; keep last valid limit.
          const value = parseInt(e.target.value, 10);
          if (value >= 1) {
            setCrawlConfig((prev) => ({ ...prev, limit: value }));
          }
        }}
        inputProps={{ min: 1 }}
        style={{ marginBottom: "20px" }}
      />

      {/* Toggles visibility of the advanced-options section below. */}
      <Button
        onClick={() => setShowCrawlAdvanced(!showCrawlAdvanced)}
        sx={{
          mb: 2,
          textTransform: 'none',
          color: '#ff00c3'
        }}
      >
        {showCrawlAdvanced ? 'Hide Advanced Options' : 'Advanced Options'}
      </Button>

      <Collapse in={showCrawlAdvanced}>
        <Box sx={{ mb: 2 }}>
          <FormControl fullWidth sx={{ mb: 2 }}>
            <InputLabel>Crawl Scope</InputLabel>
            <Select
              value={crawlConfig.mode || 'domain'}
              label="Crawl Scope"
              onChange={(e) => setCrawlConfig((prev) => ({ ...prev, mode: e.target.value }))}
            >
              <MenuItem value="domain">Same Domain Only</MenuItem>
              <MenuItem value="subdomain">Include Subdomains</MenuItem>
              <MenuItem value="path">Specific Path Only</MenuItem>
            </Select>
          </FormControl>

          <TextField
            label="Max Depth"
            type="number"
            fullWidth
            value={crawlConfig.maxDepth || 3}
            onChange={(e) => {
              // Same NaN / minimum-1 guard as the page limit above.
              const value = parseInt(e.target.value, 10);
              if (value >= 1) {
                setCrawlConfig((prev) => ({ ...prev, maxDepth: value }));
              }
            }}
            inputProps={{ min: 1 }}
            sx={{ mb: 2 }}
            helperText="How many links deep to follow (default: 3)"
          />

          <TextField
            label="Include Paths"
            placeholder="Example: /products, /blog"
            fullWidth
            value={crawlConfig.includePaths?.join(', ') || ''}
            onChange={(e) => {
              // Comma-separated text <-> string[] round-trip; entries are trimmed.
              const paths = e.target.value ? e.target.value.split(',').map(p => p.trim()) : [];
              setCrawlConfig((prev) => ({ ...prev, includePaths: paths }));
            }}
            sx={{ mb: 2 }}
            helperText="Only crawl URLs matching these paths (comma-separated)"
          />

          <TextField
            label="Exclude Paths"
            placeholder="Example: /admin, /login"
            fullWidth
            value={crawlConfig.excludePaths?.join(', ') || ''}
            onChange={(e) => {
              // Same comma-separated round-trip as Include Paths.
              const paths = e.target.value ? e.target.value.split(',').map(p => p.trim()) : [];
              setCrawlConfig((prev) => ({ ...prev, excludePaths: paths }));
            }}
            sx={{ mb: 2 }}
            helperText="Skip URLs matching these paths (comma-separated)"
          />

          {/* Boolean toggles; `?? true` makes every flag default to enabled when unset. */}
          <Box sx={{ display: 'flex', flexDirection: 'column', gap: 1 }}>
            <FormControlLabel
              control={
                <Checkbox
                  checked={crawlConfig.useSitemap ?? true}
                  onChange={(e) => setCrawlConfig((prev) => ({ ...prev, useSitemap: e.target.checked }))}
                />
              }
              label="Use sitemap.xml for URL discovery"
            />
            <FormControlLabel
              control={
                <Checkbox
                  checked={crawlConfig.followLinks ?? true}
                  onChange={(e) => setCrawlConfig((prev) => ({ ...prev, followLinks: e.target.checked }))}
                />
              }
              label="Follow links on pages"
            />
            <FormControlLabel
              control={
                <Checkbox
                  checked={crawlConfig.respectRobots ?? true}
                  onChange={(e) => setCrawlConfig((prev) => ({ ...prev, respectRobots: e.target.checked }))}
                />
              }
              label="Respect robots.txt"
            />
          </Box>
        </Box>
      </Collapse>
    </>
  );
};
|
||||
|
||||
/**
 * Render the editable search configuration form (query, result count,
 * discover/scrape mode, time-range filter).
 * Returns null unless the robot is a search robot. All inputs write back
 * into the `searchConfig` state, which handleSave persists into the
 * workflow's search action args.
 */
const renderSearchConfigFields = () => {
  if (robot?.recording_meta.type !== 'search') return null;

  return (
    <>
      <TextField
        label="Search Query"
        placeholder="Example: latest AI breakthroughs 2025"
        fullWidth
        value={searchConfig.query || ''}
        onChange={(e) => {
          setSearchConfig((prev) => ({ ...prev, query: e.target.value }));
        }}
        sx={{ mb: 2 }}
      />

      <TextField
        label="Number of Results"
        type="number"
        fullWidth
        value={searchConfig.limit || 10}
        onChange={(e) => {
          // Ignore NaN (cleared field) and values below 1; keep last valid limit.
          const value = parseInt(e.target.value, 10);
          if (value >= 1) {
            setSearchConfig((prev) => ({ ...prev, limit: value }));
          }
        }}
        inputProps={{ min: 1 }}
        sx={{ mb: 2 }}
      />

      {/* 'discover' lists result URLs only; 'scrape' extracts data from each result. */}
      <FormControl fullWidth sx={{ mb: 2 }}>
        <InputLabel>Mode</InputLabel>
        <Select
          value={searchConfig.mode || 'discover'}
          label="Mode"
          onChange={(e) => setSearchConfig((prev) => ({ ...prev, mode: e.target.value as 'discover' | 'scrape' }))}
        >
          <MenuItem value="discover">Discover URLs Only</MenuItem>
          <MenuItem value="scrape">Extract Data from Results</MenuItem>
        </Select>
      </FormControl>

      <FormControl fullWidth sx={{ mb: 2 }}>
        <InputLabel>Time Range</InputLabel>
        <Select
          value={searchConfig.filters?.timeRange || ''}
          label="Time Range"
          onChange={(e) => setSearchConfig((prev) => ({
            ...prev,
            // Empty string ("No Filter") is stored as undefined via the `|| undefined` coercion.
            filters: { ...prev.filters, timeRange: e.target.value as '' | 'day' | 'week' | 'month' | 'year' || undefined }
          }))}
        >
          <MenuItem value="">No Filter</MenuItem>
          <MenuItem value="day">Past 24 Hours</MenuItem>
          <MenuItem value="week">Past Week</MenuItem>
          <MenuItem value="month">Past Month</MenuItem>
          <MenuItem value="year">Past Year</MenuItem>
        </Select>
      </FormControl>
    </>
  );
};
|
||||
|
||||
const handleSave = async () => {
|
||||
if (!robot) return;
|
||||
|
||||
@@ -757,6 +999,48 @@ export const RobotEditPage = ({ handleStart }: RobotSettingsProps) => {
|
||||
|
||||
const targetUrl = getTargetUrl();
|
||||
|
||||
let updatedWorkflow = robot.recording.workflow;
|
||||
if (robot.recording_meta.type === 'crawl') {
|
||||
updatedWorkflow = updatedWorkflow.map((pair: any) => {
|
||||
if (!pair.what) return pair;
|
||||
|
||||
return {
|
||||
...pair,
|
||||
what: pair.what.map((action: any) => {
|
||||
if (action.action === 'crawl') {
|
||||
return {
|
||||
...action,
|
||||
args: [{ ...crawlConfig }]
|
||||
};
|
||||
}
|
||||
return action;
|
||||
})
|
||||
};
|
||||
});
|
||||
}
|
||||
|
||||
if (robot.recording_meta.type === 'search') {
|
||||
updatedWorkflow = updatedWorkflow.map((pair: any) => {
|
||||
if (!pair.what) return pair;
|
||||
|
||||
return {
|
||||
...pair,
|
||||
what: pair.what.map((action: any) => {
|
||||
if (action.action === 'search') {
|
||||
return {
|
||||
...action,
|
||||
args: [{
|
||||
...searchConfig,
|
||||
provider: 'duckduckgo'
|
||||
}]
|
||||
};
|
||||
}
|
||||
return action;
|
||||
})
|
||||
};
|
||||
});
|
||||
}
|
||||
|
||||
const payload: any = {
|
||||
name: robot.recording_meta.name,
|
||||
limits: scrapeListLimits.map((limit) => ({
|
||||
@@ -767,8 +1051,7 @@ export const RobotEditPage = ({ handleStart }: RobotSettingsProps) => {
|
||||
})),
|
||||
credentials: credentialsForPayload,
|
||||
targetUrl: targetUrl,
|
||||
// send the (possibly edited) workflow so backend can persist action name changes
|
||||
workflow: robot.recording.workflow,
|
||||
workflow: updatedWorkflow,
|
||||
};
|
||||
|
||||
const success = await updateRecording(robot.recording_meta.id, payload);
|
||||
@@ -818,26 +1101,21 @@ export const RobotEditPage = ({ handleStart }: RobotSettingsProps) => {
|
||||
style={{ marginBottom: "20px" }}
|
||||
/>
|
||||
|
||||
<TextField
|
||||
label={t("robot_duplication.fields.target_url")}
|
||||
key={t("robot_duplication.fields.target_url")}
|
||||
value={getTargetUrl() || ""}
|
||||
onChange={(e) => handleTargetUrlChange(e.target.value)}
|
||||
style={{ marginBottom: "20px" }}
|
||||
/>
|
||||
{renderScrapeListLimitFields() && (
|
||||
<>
|
||||
<Divider />
|
||||
{renderScrapeListLimitFields()}
|
||||
</>
|
||||
{robot.recording_meta.type !== 'search' && (
|
||||
<TextField
|
||||
label={t("robot_duplication.fields.target_url")}
|
||||
key={t("robot_duplication.fields.target_url")}
|
||||
value={getTargetUrl() || ""}
|
||||
onChange={(e) => handleTargetUrlChange(e.target.value)}
|
||||
style={{ marginBottom: "20px" }}
|
||||
/>
|
||||
)}
|
||||
|
||||
{renderCrawlConfigFields()}
|
||||
{renderSearchConfigFields()}
|
||||
|
||||
{renderActionNameFields() && (
|
||||
<>
|
||||
<Divider />
|
||||
{renderActionNameFields()}
|
||||
</>
|
||||
)}
|
||||
{renderScrapeListLimitFields()}
|
||||
{renderActionNameFields()}
|
||||
</>
|
||||
)}
|
||||
</Box>
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
import React, { useState, useEffect } from "react";
|
||||
import { useState, useEffect } from "react";
|
||||
import { useTranslation } from "react-i18next";
|
||||
import { TextField, Typography, Box, Card, CardContent } from "@mui/material";
|
||||
import { Settings, Info } from "@mui/icons-material";
|
||||
import { TextField, Box } from "@mui/material";
|
||||
import { useGlobalInfoStore } from "../../../context/globalInfo";
|
||||
import { getStoredRecording } from "../../../api/storage";
|
||||
import { WhereWhatPair } from "maxun-core";
|
||||
@@ -16,7 +15,7 @@ interface RobotMeta {
|
||||
pairs: number;
|
||||
updatedAt: string;
|
||||
params: any[];
|
||||
type?: 'extract' | 'scrape';
|
||||
type?: 'extract' | 'scrape' | 'crawl' | 'search';
|
||||
url?: string;
|
||||
formats?: ('markdown' | 'html' | 'screenshot-visible' | 'screenshot-fullpage')[];
|
||||
isLLM?: boolean;
|
||||
@@ -116,19 +115,11 @@ export const RobotSettingsPage = ({ handleStart }: RobotSettingsProps) => {
|
||||
fetchUserEmail();
|
||||
}, [robot?.userId]);
|
||||
|
||||
const handleCancel = () => {
|
||||
const basePath = location.pathname.includes("/prebuilt-robots")
|
||||
? "/prebuilt-robots"
|
||||
: "/robots";
|
||||
navigate(basePath);
|
||||
};
|
||||
|
||||
const targetUrl = getTargetUrl();
|
||||
|
||||
return (
|
||||
<RobotConfigPage
|
||||
title={t("robot_settings.title")}
|
||||
onCancel={handleCancel}
|
||||
cancelButtonText={t("robot_settings.buttons.close")}
|
||||
showSaveButton={false}
|
||||
showCancelButton={false}
|
||||
@@ -137,15 +128,17 @@ export const RobotSettingsPage = ({ handleStart }: RobotSettingsProps) => {
|
||||
<Box style={{ display: "flex", flexDirection: "column" }}>
|
||||
{robot && (
|
||||
<>
|
||||
<TextField
|
||||
label={t("robot_settings.target_url")}
|
||||
key="Robot Target URL"
|
||||
value={targetUrl}
|
||||
InputProps={{
|
||||
readOnly: true,
|
||||
}}
|
||||
style={{ marginBottom: "20px" }}
|
||||
/>
|
||||
{robot.recording_meta.type !== 'search' && (
|
||||
<TextField
|
||||
label={t("robot_settings.target_url")}
|
||||
key="Robot Target URL"
|
||||
value={targetUrl}
|
||||
InputProps={{
|
||||
readOnly: true,
|
||||
}}
|
||||
style={{ marginBottom: "20px" }}
|
||||
/>
|
||||
)}
|
||||
<TextField
|
||||
label={t("robot_settings.robot_id")}
|
||||
key="Robot ID"
|
||||
|
||||
@@ -1,20 +1,20 @@
|
||||
import {
|
||||
Box,
|
||||
Tabs,
|
||||
Typography,
|
||||
Tab,
|
||||
Paper,
|
||||
Button,
|
||||
CircularProgress,
|
||||
Accordion,
|
||||
AccordionSummary,
|
||||
AccordionDetails
|
||||
AccordionDetails,
|
||||
Link
|
||||
} from "@mui/material";
|
||||
import * as React from "react";
|
||||
import { Data } from "./RunsTable";
|
||||
import { TabPanel, TabContext } from "@mui/lab";
|
||||
import ExpandMoreIcon from '@mui/icons-material/ExpandMore';
|
||||
import { useEffect, useState } from "react";
|
||||
import JSZip from "jszip";
|
||||
import Table from '@mui/material/Table';
|
||||
import TableBody from '@mui/material/TableBody';
|
||||
import TableCell from '@mui/material/TableCell';
|
||||
@@ -37,7 +37,9 @@ interface RunContentProps {
|
||||
} | null,
|
||||
}
|
||||
|
||||
export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRef, abortRunHandler, workflowProgress }: RunContentProps) => { const { t } = useTranslation();
|
||||
export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRef, abortRunHandler, workflowProgress }: RunContentProps) => {
|
||||
const { t } = useTranslation();
|
||||
const { darkMode } = useThemeMode();
|
||||
const [tab, setTab] = React.useState<string>('output');
|
||||
const [markdownContent, setMarkdownContent] = useState<string>('');
|
||||
const [htmlContent, setHtmlContent] = useState<string>('');
|
||||
@@ -54,6 +56,15 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe
|
||||
const [listKeys, setListKeys] = useState<string[]>([]);
|
||||
const [currentListIndex, setCurrentListIndex] = useState<number>(0);
|
||||
|
||||
const [crawlData, setCrawlData] = useState<any[][]>([]);
|
||||
const [crawlColumns, setCrawlColumns] = useState<string[][]>([]);
|
||||
const [crawlKeys, setCrawlKeys] = useState<string[]>([]);
|
||||
const [currentCrawlIndex, setCurrentCrawlIndex] = useState<number>(0);
|
||||
|
||||
const [searchData, setSearchData] = useState<any[]>([]);
|
||||
const [searchMode, setSearchMode] = useState<'discover' | 'scrape'>('discover');
|
||||
const [currentSearchIndex, setCurrentSearchIndex] = useState<number>(0);
|
||||
|
||||
const [screenshotKeys, setScreenshotKeys] = useState<string[]>([]);
|
||||
const [screenshotKeyMap, setScreenshotKeyMap] = useState<Record<string, string>>({});
|
||||
const [currentScreenshotIndex, setCurrentScreenshotIndex] = useState<number>(0);
|
||||
@@ -106,6 +117,10 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe
|
||||
setListData([]);
|
||||
setListColumns([]);
|
||||
setListKeys([]);
|
||||
setCrawlData([]);
|
||||
setCrawlColumns([]);
|
||||
setCrawlKeys([]);
|
||||
setSearchData([]);
|
||||
setLegacyData([]);
|
||||
setLegacyColumns([]);
|
||||
setIsLegacyData(false);
|
||||
@@ -117,7 +132,7 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe
|
||||
|
||||
const hasLegacySchema = row.serializableOutput.scrapeSchema && Array.isArray(row.serializableOutput.scrapeSchema);
|
||||
const hasLegacyList = row.serializableOutput.scrapeList && Array.isArray(row.serializableOutput.scrapeList);
|
||||
const hasOldFormat = !row.serializableOutput.scrapeSchema && !row.serializableOutput.scrapeList && Object.keys(row.serializableOutput).length > 0;
|
||||
const hasOldFormat = !row.serializableOutput.scrapeSchema && !row.serializableOutput.scrapeList && !row.serializableOutput.crawl && !row.serializableOutput.search && Object.keys(row.serializableOutput).length > 0;
|
||||
|
||||
if (hasLegacySchema || hasLegacyList || hasOldFormat) {
|
||||
processLegacyData(row.serializableOutput);
|
||||
@@ -134,6 +149,14 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe
|
||||
if (row.serializableOutput.scrapeList) {
|
||||
processScrapeList(row.serializableOutput.scrapeList);
|
||||
}
|
||||
|
||||
if (row.serializableOutput.crawl) {
|
||||
processCrawl(row.serializableOutput.crawl);
|
||||
}
|
||||
|
||||
if (row.serializableOutput.search) {
|
||||
processSearch(row.serializableOutput.search);
|
||||
}
|
||||
}, [row.serializableOutput, row.status]);
|
||||
|
||||
useEffect(() => {
|
||||
@@ -152,7 +175,6 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe
|
||||
let normalizedScreenshotKeys: string[];
|
||||
|
||||
if (isLegacyPattern) {
|
||||
// Legacy unnamed screenshots → Screenshot 1, Screenshot 2...
|
||||
normalizedScreenshotKeys = rawKeys.map((_, index) => `Screenshot ${index + 1}`);
|
||||
} else {
|
||||
normalizedScreenshotKeys = rawKeys.map((key, index) => {
|
||||
@@ -355,6 +377,76 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe
|
||||
setCurrentListIndex(0);
|
||||
};
|
||||
|
||||
const processCrawl = (crawlDataInput: any) => {
|
||||
const tablesList: any[][] = [];
|
||||
const columnsList: string[][] = [];
|
||||
const keys: string[] = [];
|
||||
|
||||
if (typeof crawlDataInput === 'object') {
|
||||
Object.keys(crawlDataInput).forEach(key => {
|
||||
const tableData = crawlDataInput[key];
|
||||
|
||||
if (Array.isArray(tableData) && tableData.length > 0) {
|
||||
const filteredData = tableData.filter(row =>
|
||||
row && typeof row === 'object' && Object.values(row).some(value => value !== undefined && value !== "")
|
||||
);
|
||||
|
||||
if (filteredData.length > 0) {
|
||||
tablesList.push(filteredData);
|
||||
keys.push(key);
|
||||
const tableColumns = new Set<string>();
|
||||
filteredData.forEach(item => {
|
||||
Object.keys(item).forEach(key => tableColumns.add(key));
|
||||
});
|
||||
columnsList.push(Array.from(tableColumns));
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
setCrawlData(tablesList);
|
||||
setCrawlColumns(columnsList);
|
||||
const normalizedCrawlKeys = keys.map((key, index) => {
|
||||
if (!key || key.toLowerCase().includes("crawl")) {
|
||||
return `Crawl ${index + 1}`;
|
||||
}
|
||||
return key;
|
||||
});
|
||||
|
||||
setCrawlKeys(normalizedCrawlKeys);
|
||||
setCurrentCrawlIndex(0);
|
||||
};
|
||||
|
||||
const processSearch = (searchDataInput: any) => {
|
||||
if (typeof searchDataInput === 'object') {
|
||||
const keys = Object.keys(searchDataInput);
|
||||
|
||||
if (keys.length > 0) {
|
||||
const searchKey = keys[0];
|
||||
const searchInfo = searchDataInput[searchKey];
|
||||
|
||||
if (searchInfo && searchInfo.results && Array.isArray(searchInfo.results)) {
|
||||
const mode = searchInfo.mode || 'discover';
|
||||
setSearchMode(mode);
|
||||
|
||||
if (mode === 'scrape') {
|
||||
setSearchData(searchInfo.results);
|
||||
} else {
|
||||
const normalizedResults = searchInfo.results.map((result: any, index: number) => ({
|
||||
title: result.title || '-',
|
||||
url: result.url || '-',
|
||||
description: result.description || '-',
|
||||
position: result.position || index + 1,
|
||||
}));
|
||||
setSearchData(normalizedResults);
|
||||
}
|
||||
|
||||
setCurrentSearchIndex(0);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
const convertToCSV = (data: any[], columns: string[], isSchemaData: boolean = false, isTabular: boolean = false): string => {
|
||||
if (isSchemaData && !isTabular && data.length === 1) {
|
||||
const header = 'Label,Value';
|
||||
@@ -375,7 +467,6 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe
|
||||
}
|
||||
};
|
||||
|
||||
// Function to download a specific dataset as CSV
|
||||
const downloadCSV = (data: any[], columns: string[], filename: string, isSchemaData: boolean = false, isTabular: boolean = false) => {
|
||||
const csvContent = convertToCSV(data, columns, isSchemaData, isTabular);
|
||||
const blob = new Blob([csvContent], { type: 'text/csv;charset=utf-8;' });
|
||||
@@ -426,6 +517,33 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe
|
||||
}, 100);
|
||||
};
|
||||
|
||||
const downloadAllCrawlsAsZip = async (crawlDataArray: any[], zipFilename: string) => {
|
||||
const zip = new JSZip();
|
||||
|
||||
crawlDataArray.forEach((item, index) => {
|
||||
const url = item?.metadata?.url || item?.url || '';
|
||||
const filename = url
|
||||
? url.replace(/^https?:\/\//, '').replace(/\//g, '_').replace(/[^a-zA-Z0-9_.-]/g, '_') + '.json'
|
||||
: `crawl_url_${index + 1}.json`;
|
||||
|
||||
const jsonContent = JSON.stringify(item, null, 2);
|
||||
zip.file(filename, jsonContent);
|
||||
});
|
||||
|
||||
const blob = await zip.generateAsync({ type: 'blob' });
|
||||
const url = URL.createObjectURL(blob);
|
||||
|
||||
const link = document.createElement("a");
|
||||
link.href = url;
|
||||
link.setAttribute("download", zipFilename);
|
||||
document.body.appendChild(link);
|
||||
link.click();
|
||||
document.body.removeChild(link);
|
||||
|
||||
setTimeout(() => {
|
||||
URL.revokeObjectURL(url);
|
||||
}, 100);
|
||||
};
|
||||
|
||||
const renderDataTable = (
|
||||
data: any[],
|
||||
@@ -433,14 +551,13 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe
|
||||
title: string,
|
||||
csvFilename: string,
|
||||
jsonFilename: string,
|
||||
isPaginatedList: boolean = false,
|
||||
isSchemaData: boolean = false
|
||||
) => {
|
||||
if (data.length === 0) return null;
|
||||
|
||||
const shouldShowAsKeyValue = isSchemaData && !isSchemaTabular && data.length === 1;
|
||||
|
||||
if (title === '') {
|
||||
if (!title || title.trim() === '') {
|
||||
return (
|
||||
<>
|
||||
<Box sx={{ mb: 2 }}>
|
||||
@@ -686,7 +803,7 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe
|
||||
);
|
||||
};
|
||||
|
||||
const hasData = schemaData.length > 0 || listData.length > 0 || legacyData.length > 0;
|
||||
const hasData = schemaData.length > 0 || listData.length > 0 || crawlData.length > 0 || searchData.length > 0 || legacyData.length > 0;
|
||||
const hasScreenshots = row.binaryOutput && Object.keys(row.binaryOutput).length > 0;
|
||||
const hasMarkdown = markdownContent.length > 0;
|
||||
const hasHTML = htmlContent.length > 0;
|
||||
@@ -818,7 +935,6 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe
|
||||
)}
|
||||
</>
|
||||
) : (
|
||||
// Extract robot output
|
||||
<>
|
||||
{row.status === 'running' || row.status === 'queued' ? (
|
||||
<>
|
||||
@@ -910,7 +1026,6 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe
|
||||
'',
|
||||
`${schemaKeys[currentSchemaIndex] || 'schema_data'}.csv`,
|
||||
`${schemaKeys[currentSchemaIndex] || 'schema_data'}.json`,
|
||||
false,
|
||||
true
|
||||
)}
|
||||
</AccordionDetails>
|
||||
@@ -1059,6 +1174,588 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe
|
||||
)}
|
||||
</>
|
||||
)}
|
||||
|
||||
{crawlData.length > 0 && crawlData[0] && crawlData[0].length > 0 && (
|
||||
<Accordion defaultExpanded sx={{ mb: 2 }}>
|
||||
<AccordionSummary expandIcon={<ExpandMoreIcon />}>
|
||||
<Box sx={{ display: 'flex', alignItems: 'center' }}>
|
||||
<Typography variant='h6'>
|
||||
Crawl Results
|
||||
</Typography>
|
||||
</Box>
|
||||
</AccordionSummary>
|
||||
<AccordionDetails>
|
||||
<Box
|
||||
sx={{
|
||||
display: 'flex',
|
||||
overflowX: 'auto',
|
||||
borderBottom: '1px solid',
|
||||
borderColor: darkMode ? '#2a3441' : '#dee2e6',
|
||||
mb: 2,
|
||||
'&::-webkit-scrollbar': {
|
||||
height: '8px',
|
||||
},
|
||||
'&::-webkit-scrollbar-track': {
|
||||
backgroundColor: darkMode ? '#1e1e1e' : '#f1f1f1',
|
||||
},
|
||||
'&::-webkit-scrollbar-thumb': {
|
||||
backgroundColor: darkMode ? '#555' : '#888',
|
||||
borderRadius: '4px',
|
||||
},
|
||||
'&::-webkit-scrollbar-thumb:hover': {
|
||||
backgroundColor: '#FF00C3',
|
||||
},
|
||||
}}
|
||||
>
|
||||
{crawlData[0].map((item: any, idx: number) => {
|
||||
const url = item?.metadata?.url || item?.url || `URL ${idx + 1}`;
|
||||
|
||||
return (
|
||||
<Box
|
||||
key={idx}
|
||||
onClick={() => setCurrentCrawlIndex(idx)}
|
||||
sx={{
|
||||
px: 2,
|
||||
py: 1,
|
||||
cursor: 'pointer',
|
||||
backgroundColor: currentCrawlIndex === idx
|
||||
? darkMode ? '#121111ff' : '#e9ecef'
|
||||
: 'transparent',
|
||||
borderBottom: currentCrawlIndex === idx ? '3px solid #FF00C3' : 'none',
|
||||
color: darkMode ? '#fff' : '#000',
|
||||
whiteSpace: 'nowrap',
|
||||
fontSize: '0.875rem',
|
||||
flexShrink: 0,
|
||||
}}
|
||||
title={url}
|
||||
>
|
||||
Link {idx + 1}
|
||||
</Box>
|
||||
);
|
||||
})}
|
||||
</Box>
|
||||
|
||||
{crawlData[0][currentCrawlIndex] && (
|
||||
<>
|
||||
<Accordion defaultExpanded sx={{ mb: 2 }}>
|
||||
<AccordionSummary expandIcon={<ExpandMoreIcon />}>
|
||||
<Box sx={{ display: 'flex', alignItems: 'center' }}>
|
||||
<Typography variant='h6'>
|
||||
Metadata
|
||||
</Typography>
|
||||
</Box>
|
||||
</AccordionSummary>
|
||||
<AccordionDetails>
|
||||
<TableContainer component={Paper} sx={{ maxHeight: 300 }}>
|
||||
<Table size="small">
|
||||
<TableBody>
|
||||
{crawlData[0][currentCrawlIndex].metadata &&
|
||||
Object.entries(crawlData[0][currentCrawlIndex].metadata).map(([key, value]: [string, any]) => (
|
||||
<TableRow key={key}>
|
||||
<TableCell sx={{ fontWeight: 500, width: '200px' }}>
|
||||
{key}
|
||||
</TableCell>
|
||||
<TableCell sx={{ wordBreak: 'break-word' }}>
|
||||
{value === undefined || value === '' ? '-' : String(value)}
|
||||
</TableCell>
|
||||
</TableRow>
|
||||
))
|
||||
}
|
||||
</TableBody>
|
||||
</Table>
|
||||
</TableContainer>
|
||||
</AccordionDetails>
|
||||
</Accordion>
|
||||
|
||||
{crawlData[0][currentCrawlIndex].text && (
|
||||
<Accordion defaultExpanded sx={{ mb: 2 }}>
|
||||
<AccordionSummary expandIcon={<ExpandMoreIcon />}>
|
||||
<Box sx={{ display: 'flex', alignItems: 'center' }}>
|
||||
<Typography variant='h6'>
|
||||
Text Content
|
||||
</Typography>
|
||||
</Box>
|
||||
</AccordionSummary>
|
||||
<AccordionDetails>
|
||||
<Paper
|
||||
sx={{
|
||||
p: 2,
|
||||
maxHeight: '300px',
|
||||
overflow: 'auto',
|
||||
backgroundColor: darkMode ? '#1e1e1e' : '#f5f5f5'
|
||||
}}
|
||||
>
|
||||
<Typography
|
||||
component="pre"
|
||||
sx={{
|
||||
whiteSpace: 'pre-wrap',
|
||||
wordBreak: 'break-word',
|
||||
fontFamily: 'monospace',
|
||||
fontSize: '0.75rem',
|
||||
m: 0
|
||||
}}
|
||||
>
|
||||
{crawlData[0][currentCrawlIndex].text}
|
||||
</Typography>
|
||||
</Paper>
|
||||
</AccordionDetails>
|
||||
</Accordion>
|
||||
)}
|
||||
|
||||
{crawlData[0][currentCrawlIndex].html && (
|
||||
<Accordion defaultExpanded sx={{ mb: 2 }}>
|
||||
<AccordionSummary expandIcon={<ExpandMoreIcon />}>
|
||||
<Box sx={{ display: 'flex', alignItems: 'center' }}>
|
||||
<Typography variant='h6'>
|
||||
HTML
|
||||
</Typography>
|
||||
</Box>
|
||||
</AccordionSummary>
|
||||
<AccordionDetails>
|
||||
<Paper
|
||||
sx={{
|
||||
p: 2,
|
||||
maxHeight: '300px',
|
||||
overflow: 'auto',
|
||||
backgroundColor: darkMode ? '#1e1e1e' : '#f5f5f5'
|
||||
}}
|
||||
>
|
||||
<Typography
|
||||
component="pre"
|
||||
sx={{
|
||||
whiteSpace: 'pre-wrap',
|
||||
wordBreak: 'break-word',
|
||||
fontFamily: 'monospace',
|
||||
fontSize: '0.65rem',
|
||||
m: 0
|
||||
}}
|
||||
>
|
||||
{crawlData[0][currentCrawlIndex].html}
|
||||
</Typography>
|
||||
</Paper>
|
||||
</AccordionDetails>
|
||||
</Accordion>
|
||||
)}
|
||||
|
||||
{crawlData[0][currentCrawlIndex].links && crawlData[0][currentCrawlIndex].links.length > 0 && (
|
||||
<Accordion defaultExpanded sx={{ mb: 2 }}>
|
||||
<AccordionSummary expandIcon={<ExpandMoreIcon />}>
|
||||
<Box sx={{ display: 'flex', alignItems: 'center' }}>
|
||||
<Typography variant='h6'>
|
||||
Links ({crawlData[0][currentCrawlIndex].links.length})
|
||||
</Typography>
|
||||
</Box>
|
||||
</AccordionSummary>
|
||||
<AccordionDetails>
|
||||
<Paper sx={{ maxHeight: 200, overflow: 'auto', p: 1 }}>
|
||||
{crawlData[0][currentCrawlIndex].links.map((link: string, idx: number) => (
|
||||
<Typography key={idx} sx={{ fontSize: '0.75rem', mb: 0.5 }}>
|
||||
{link}
|
||||
</Typography>
|
||||
))}
|
||||
</Paper>
|
||||
</AccordionDetails>
|
||||
</Accordion>
|
||||
)}
|
||||
|
||||
<Box sx={{ display: 'flex', gap: 2, mt: 2 }}>
|
||||
<Button
|
||||
onClick={() => {
|
||||
const currentUrl = crawlData[0][currentCrawlIndex]?.metadata?.url || crawlData[0][currentCrawlIndex]?.url || '';
|
||||
const filename = currentUrl
|
||||
? currentUrl.replace(/^https?:\/\//, '').replace(/\//g, '_').replace(/[^a-zA-Z0-9_.-]/g, '_') + '.json'
|
||||
: `crawl_url_${currentCrawlIndex + 1}.json`;
|
||||
downloadJSON(
|
||||
[crawlData[0][currentCrawlIndex]],
|
||||
filename
|
||||
);
|
||||
}}
|
||||
sx={{
|
||||
color: '#FF00C3',
|
||||
textTransform: 'none',
|
||||
p: 0,
|
||||
minWidth: 'auto',
|
||||
backgroundColor: 'transparent',
|
||||
'&:hover': {
|
||||
backgroundColor: 'transparent',
|
||||
textDecoration: 'underline',
|
||||
},
|
||||
}}
|
||||
>
|
||||
Download This Page as JSON
|
||||
</Button>
|
||||
|
||||
<Button
|
||||
onClick={() => {
|
||||
const firstUrl = crawlData[0][0]?.metadata?.url || crawlData[0][0]?.url || '';
|
||||
const baseFilename = firstUrl
|
||||
? firstUrl.replace(/^https?:\/\//, '').split('/')[0].replace(/[^a-zA-Z0-9_.-]/g, '_')
|
||||
: 'crawl';
|
||||
downloadAllCrawlsAsZip(
|
||||
crawlData[0],
|
||||
`${baseFilename}_all_urls.zip`
|
||||
);
|
||||
}}
|
||||
sx={{
|
||||
color: '#FF00C3',
|
||||
textTransform: 'none',
|
||||
p: 0,
|
||||
minWidth: 'auto',
|
||||
backgroundColor: 'transparent',
|
||||
'&:hover': {
|
||||
backgroundColor: 'transparent',
|
||||
textDecoration: 'underline',
|
||||
},
|
||||
}}
|
||||
>
|
||||
Download All Pages as JSON
|
||||
</Button>
|
||||
</Box>
|
||||
</>
|
||||
)}
|
||||
</AccordionDetails>
|
||||
</Accordion>
|
||||
)}
|
||||
|
||||
{searchData.length > 0 && (
|
||||
<Accordion defaultExpanded sx={{ mb: 2 }}>
|
||||
<AccordionSummary expandIcon={<ExpandMoreIcon />}>
|
||||
<Box sx={{ display: 'flex', alignItems: 'center' }}>
|
||||
<Typography variant='h6'>
|
||||
Search Results
|
||||
</Typography>
|
||||
</Box>
|
||||
</AccordionSummary>
|
||||
<AccordionDetails>
|
||||
{searchMode === 'scrape' && searchData.length > 0 ? (
|
||||
<>
|
||||
<Box
|
||||
sx={{
|
||||
display: 'flex',
|
||||
overflowX: 'auto',
|
||||
borderBottom: '1px solid',
|
||||
borderColor: darkMode ? '#2a3441' : '#dee2e6',
|
||||
mb: 2,
|
||||
'&::-webkit-scrollbar': {
|
||||
height: '8px',
|
||||
},
|
||||
'&::-webkit-scrollbar-track': {
|
||||
backgroundColor: darkMode ? '#1e1e1e' : '#f1f1f1',
|
||||
},
|
||||
'&::-webkit-scrollbar-thumb': {
|
||||
backgroundColor: darkMode ? '#555' : '#888',
|
||||
borderRadius: '4px',
|
||||
},
|
||||
'&::-webkit-scrollbar-thumb:hover': {
|
||||
backgroundColor: '#FF00C3',
|
||||
},
|
||||
}}
|
||||
>
|
||||
{searchData.map((item: any, idx: number) => {
|
||||
const url = item?.metadata?.url || item?.url || `Result ${idx + 1}`;
|
||||
|
||||
return (
|
||||
<Box
|
||||
key={idx}
|
||||
onClick={() => setCurrentSearchIndex(idx)}
|
||||
sx={{
|
||||
px: 2,
|
||||
py: 1,
|
||||
cursor: 'pointer',
|
||||
backgroundColor: currentSearchIndex === idx
|
||||
? darkMode ? '#121111ff' : '#e9ecef'
|
||||
: 'transparent',
|
||||
borderBottom: currentSearchIndex === idx ? '3px solid #FF00C3' : 'none',
|
||||
color: darkMode ? '#fff' : '#000',
|
||||
whiteSpace: 'nowrap',
|
||||
fontSize: '0.875rem',
|
||||
flexShrink: 0,
|
||||
}}
|
||||
title={url}
|
||||
>
|
||||
Link {idx + 1}
|
||||
</Box>
|
||||
);
|
||||
})}
|
||||
</Box>
|
||||
|
||||
{searchData[currentSearchIndex] && (
|
||||
<>
|
||||
<Accordion defaultExpanded sx={{ mb: 2 }}>
|
||||
<AccordionSummary expandIcon={<ExpandMoreIcon />}>
|
||||
<Box sx={{ display: 'flex', alignItems: 'center' }}>
|
||||
<Typography variant='h6'>
|
||||
Metadata
|
||||
</Typography>
|
||||
</Box>
|
||||
</AccordionSummary>
|
||||
<AccordionDetails>
|
||||
<TableContainer component={Paper} sx={{ maxHeight: 300 }}>
|
||||
<Table size="small">
|
||||
<TableBody>
|
||||
{searchData[currentSearchIndex].metadata &&
|
||||
Object.entries(searchData[currentSearchIndex].metadata).map(([key, value]: [string, any]) => (
|
||||
<TableRow key={key}>
|
||||
<TableCell sx={{ fontWeight: 500, width: '200px' }}>
|
||||
{key}
|
||||
</TableCell>
|
||||
<TableCell sx={{ wordBreak: 'break-word' }}>
|
||||
{value === undefined || value === '' ? '-' : String(value)}
|
||||
</TableCell>
|
||||
</TableRow>
|
||||
))
|
||||
}
|
||||
</TableBody>
|
||||
</Table>
|
||||
</TableContainer>
|
||||
</AccordionDetails>
|
||||
</Accordion>
|
||||
|
||||
{searchData[currentSearchIndex].text && (
|
||||
<Accordion defaultExpanded sx={{ mb: 2 }}>
|
||||
<AccordionSummary expandIcon={<ExpandMoreIcon />}>
|
||||
<Box sx={{ display: 'flex', alignItems: 'center' }}>
|
||||
<Typography variant='h6'>
|
||||
Text Content
|
||||
</Typography>
|
||||
</Box>
|
||||
</AccordionSummary>
|
||||
<AccordionDetails>
|
||||
<Paper
|
||||
sx={{
|
||||
p: 2,
|
||||
maxHeight: '300px',
|
||||
overflow: 'auto',
|
||||
backgroundColor: darkMode ? '#1e1e1e' : '#f5f5f5'
|
||||
}}
|
||||
>
|
||||
<Typography
|
||||
component="pre"
|
||||
sx={{
|
||||
whiteSpace: 'pre-wrap',
|
||||
wordBreak: 'break-word',
|
||||
fontFamily: 'monospace',
|
||||
fontSize: '0.75rem',
|
||||
m: 0
|
||||
}}
|
||||
>
|
||||
{searchData[currentSearchIndex].text}
|
||||
</Typography>
|
||||
</Paper>
|
||||
</AccordionDetails>
|
||||
</Accordion>
|
||||
)}
|
||||
|
||||
{searchData[currentSearchIndex].html && (
|
||||
<Accordion defaultExpanded sx={{ mb: 2 }}>
|
||||
<AccordionSummary expandIcon={<ExpandMoreIcon />}>
|
||||
<Box sx={{ display: 'flex', alignItems: 'center' }}>
|
||||
<Typography variant='h6'>
|
||||
HTML
|
||||
</Typography>
|
||||
</Box>
|
||||
</AccordionSummary>
|
||||
<AccordionDetails>
|
||||
<Paper
|
||||
sx={{
|
||||
p: 2,
|
||||
maxHeight: '300px',
|
||||
overflow: 'auto',
|
||||
backgroundColor: darkMode ? '#1e1e1e' : '#f5f5f5'
|
||||
}}
|
||||
>
|
||||
<Typography
|
||||
component="pre"
|
||||
sx={{
|
||||
whiteSpace: 'pre-wrap',
|
||||
wordBreak: 'break-word',
|
||||
fontFamily: 'monospace',
|
||||
fontSize: '0.75rem',
|
||||
m: 0
|
||||
}}
|
||||
>
|
||||
{searchData[currentSearchIndex].html}
|
||||
</Typography>
|
||||
</Paper>
|
||||
</AccordionDetails>
|
||||
</Accordion>
|
||||
)}
|
||||
|
||||
{searchData[currentSearchIndex].markdown && (
|
||||
<Accordion defaultExpanded sx={{ mb: 2 }}>
|
||||
<AccordionSummary expandIcon={<ExpandMoreIcon />}>
|
||||
<Box sx={{ display: 'flex', alignItems: 'center' }}>
|
||||
<Typography variant='h6'>
|
||||
Markdown
|
||||
</Typography>
|
||||
</Box>
|
||||
</AccordionSummary>
|
||||
<AccordionDetails>
|
||||
<Paper
|
||||
sx={{
|
||||
p: 2,
|
||||
maxHeight: '300px',
|
||||
overflow: 'auto',
|
||||
backgroundColor: darkMode ? '#1e1e1e' : '#f5f5f5'
|
||||
}}
|
||||
>
|
||||
<Typography
|
||||
component="pre"
|
||||
sx={{
|
||||
whiteSpace: 'pre-wrap',
|
||||
wordBreak: 'break-word',
|
||||
fontFamily: 'monospace',
|
||||
fontSize: '0.75rem',
|
||||
m: 0
|
||||
}}
|
||||
>
|
||||
{searchData[currentSearchIndex].markdown}
|
||||
</Typography>
|
||||
</Paper>
|
||||
</AccordionDetails>
|
||||
</Accordion>
|
||||
)}
|
||||
|
||||
{searchData[currentSearchIndex].links && searchData[currentSearchIndex].links.length > 0 && (
|
||||
<Accordion defaultExpanded sx={{ mb: 2 }}>
|
||||
<AccordionSummary expandIcon={<ExpandMoreIcon />}>
|
||||
<Box sx={{ display: 'flex', alignItems: 'center' }}>
|
||||
<Typography variant='h6'>
|
||||
Links ({searchData[currentSearchIndex].links.length})
|
||||
</Typography>
|
||||
</Box>
|
||||
</AccordionSummary>
|
||||
<AccordionDetails>
|
||||
<Paper sx={{ maxHeight: 200, overflow: 'auto', p: 1 }}>
|
||||
{searchData[currentSearchIndex].links.map((link: string, idx: number) => (
|
||||
<Typography key={idx} sx={{ fontSize: '0.75rem', mb: 0.5 }}>
|
||||
{link}
|
||||
</Typography>
|
||||
))}
|
||||
</Paper>
|
||||
</AccordionDetails>
|
||||
</Accordion>
|
||||
)}
|
||||
|
||||
<Box sx={{ display: 'flex', alignItems: 'center', mt: 2 }}>
|
||||
<Button
|
||||
onClick={() => {
|
||||
const result = searchData[currentSearchIndex];
|
||||
const filename = `search_result_${currentSearchIndex + 1}.json`;
|
||||
downloadJSON(result, filename);
|
||||
}}
|
||||
sx={{
|
||||
color: '#FF00C3',
|
||||
textTransform: 'none',
|
||||
mr: 2,
|
||||
p: 0,
|
||||
minWidth: 'auto',
|
||||
backgroundColor: 'transparent',
|
||||
'&:hover': {
|
||||
backgroundColor: 'transparent',
|
||||
textDecoration: 'underline',
|
||||
},
|
||||
}}
|
||||
>
|
||||
Download as JSON
|
||||
</Button>
|
||||
</Box>
|
||||
</>
|
||||
)}
|
||||
</>
|
||||
) : (
|
||||
<>
|
||||
<TableContainer component={Paper} sx={{ maxHeight: 600 }}>
|
||||
<Table stickyHeader aria-label="search-results-table">
|
||||
<TableHead>
|
||||
<TableRow>
|
||||
<TableCell
|
||||
sx={{
|
||||
backgroundColor: darkMode ? '#11111' : '#f8f9fa',
|
||||
minWidth: '200px'
|
||||
}}
|
||||
>
|
||||
Title
|
||||
</TableCell>
|
||||
<TableCell
|
||||
sx={{
|
||||
backgroundColor: darkMode ? '#11111' : '#f8f9fa',
|
||||
minWidth: '250px'
|
||||
}}
|
||||
>
|
||||
URL
|
||||
</TableCell>
|
||||
<TableCell
|
||||
sx={{
|
||||
backgroundColor: darkMode ? '#11111' : '#f8f9fa',
|
||||
minWidth: '300px'
|
||||
}}
|
||||
>
|
||||
Description
|
||||
</TableCell>
|
||||
</TableRow>
|
||||
</TableHead>
|
||||
|
||||
<TableBody>
|
||||
{searchData.map((result: any, idx: number) => (
|
||||
<TableRow key={idx}>
|
||||
<TableCell>
|
||||
{result.title || '-'}
|
||||
</TableCell>
|
||||
<TableCell>
|
||||
{result.url ? (
|
||||
<Link
|
||||
href={result.url}
|
||||
target="_blank"
|
||||
rel="noopener noreferrer"
|
||||
sx={{
|
||||
color: '#FF00C3',
|
||||
textDecoration: 'none',
|
||||
'&:hover': {
|
||||
textDecoration: 'underline'
|
||||
},
|
||||
wordBreak: 'break-all'
|
||||
}}
|
||||
>
|
||||
{result.url}
|
||||
</Link>
|
||||
) : '-'}
|
||||
</TableCell>
|
||||
<TableCell>
|
||||
{result.description || '-'}
|
||||
</TableCell>
|
||||
</TableRow>
|
||||
))}
|
||||
</TableBody>
|
||||
</Table>
|
||||
</TableContainer>
|
||||
|
||||
<Box sx={{ display: 'flex', alignItems: 'center', mt: 2 }}>
|
||||
<Button
|
||||
onClick={() => {
|
||||
downloadJSON(searchData, 'search_results.json');
|
||||
}}
|
||||
sx={{
|
||||
color: '#FF00C3',
|
||||
textTransform: 'none',
|
||||
mr: 2,
|
||||
p: 0,
|
||||
minWidth: 'auto',
|
||||
backgroundColor: 'transparent',
|
||||
'&:hover': {
|
||||
backgroundColor: 'transparent',
|
||||
textDecoration: 'underline',
|
||||
},
|
||||
}}
|
||||
>
|
||||
Download as JSON
|
||||
</Button>
|
||||
</Box>
|
||||
</>
|
||||
)}
|
||||
</AccordionDetails>
|
||||
</Accordion>
|
||||
)}
|
||||
</Box>
|
||||
)}
|
||||
|
||||
|
||||
@@ -56,6 +56,7 @@ export interface Data {
|
||||
runByScheduleId?: string;
|
||||
browserId: string;
|
||||
runByAPI?: boolean;
|
||||
runBySDK?: boolean;
|
||||
log: string;
|
||||
runId: string;
|
||||
robotId: string;
|
||||
|
||||
@@ -27,7 +27,7 @@ interface RobotMeta {
|
||||
pairs: number;
|
||||
updatedAt: string;
|
||||
params: any[];
|
||||
type?: 'extract' | 'scrape';
|
||||
type?: 'extract' | 'scrape' | 'crawl' | 'search';
|
||||
url?: string;
|
||||
formats?: ('markdown' | 'html' | 'screenshot-visible' | 'screenshot-fullpage')[];
|
||||
isLLM?: boolean;
|
||||
|
||||
Reference in New Issue
Block a user