Making file parser flexible to deprecate pdf parser (#3073)

Co-authored-by: Suchintan <suchintan@users.noreply.github.com>
This commit is contained in:
PHSB
2025-08-06 11:15:04 -06:00
committed by GitHub
parent 31aa7d6973
commit 468f5c6051
15 changed files with 555 additions and 49 deletions

View File

@@ -19,7 +19,7 @@ Building blocks supported today:
- TextPromptBlock: A text only prompt block. - TextPromptBlock: A text only prompt block.
- SendEmailBlock: Send an email. - SendEmailBlock: Send an email.
- FileDownloadBlock: Given a goal, Skyvern downloads a file from the website. - FileDownloadBlock: Given a goal, Skyvern downloads a file from the website.
- FileParserBlock: Given a file url, Skyvern downloads the file from the url, and returns the parsed content as the output of the block. Currently only support CSV file format. - FileParserBlock: Given a file url, Skyvern downloads the file from the url, and returns the parsed content as the output of the block. Supports CSV, Excel, and PDF file formats.
- PDFParserBlock: Given a pdf url, Skyvern downloads the PDF file from the url and returns the parsed content as the output of the block. - PDFParserBlock: Given a pdf url, Skyvern downloads the PDF file from the url and returns the parsed content as the output of the block.
- FileUploadBlock: Upload all the downloaded files to a desired destination. Currently only AWS S3 is supported. Please contact support@skyvern.com if you need more integrations. - FileUploadBlock: Upload all the downloaded files to a desired destination. Currently only AWS S3 is supported. Please contact support@skyvern.com if you need more integrations.
- WaitBlock: Wait for a given amount of time. - WaitBlock: Wait for a given amount of time.

View File

@@ -43,7 +43,7 @@ This block sends an email.
This block downloads a file from the website. This block downloads a file from the website.
## FileParserBlock ## FileParserBlock
This block parses a file from the website. This block parses PDFs, CSVs, and Excel files from the website.
## PDFParserBlock ## PDFParserBlock
This block parses a PDF file from the website. This block parses a PDF file from the website.

View File

@@ -228,16 +228,16 @@ Inputs:
Downloads and parses a file to be used within other workflow blocks. Downloads and parses a file to be used within other workflow blocks.
**Supported types:** CSV **Supported types:** CSV, TSV, Excel, PDF
``` ```
- block_type: file_url_parser - block_type: file_url_parser
label: csv_parser label: file_parser
file_type: csv file_type: csv # Auto-detected from URL extension
file_url: <csv_file_url> file_url: <file_url>
``` ```
Inputs: Inputs:
1. **File URL *(required):*** This block allows you to use a CSV within your workflow. 1. **File URL *(required):*** This block allows you to use CSV, TSV, Excel, and PDF files within your workflow.
* Since we're still in beta, you will need to [contact us](https://meetings.hubspot.com/skyvern/demo?uuid=7c83865f-1a92-4c44-9e52-1ba0dbc04f7a) to load a value into this block * Since we're still in beta, you will need to [contact us](https://meetings.hubspot.com/skyvern/demo?uuid=7c83865f-1a92-4c44-9e52-1ba0dbc04f7a) to load a value into this block

51
poetry.lock generated
View File

@@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand. # This file is automatically @generated by Poetry 2.1.3 and should not be changed by hand.
[[package]] [[package]]
name = "about-time" name = "about-time"
@@ -1587,6 +1587,18 @@ files = [
[package.extras] [package.extras]
mypy = ["mypy"] mypy = ["mypy"]
[[package]]
name = "et-xmlfile"
version = "2.0.0"
description = "An implementation of lxml.xmlfile for the standard library"
optional = false
python-versions = ">=3.8"
groups = ["main"]
files = [
{file = "et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa"},
{file = "et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54"},
]
[[package]] [[package]]
name = "exceptiongroup" name = "exceptiongroup"
version = "1.3.0" version = "1.3.0"
@@ -2374,7 +2386,7 @@ description = "Lightweight in-process concurrent programming"
optional = false optional = false
python-versions = ">=3.9" python-versions = ">=3.9"
groups = ["main"] groups = ["main"]
markers = "python_version == \"3.12\" or python_version == \"3.13\"" markers = "python_version >= \"3.12\""
files = [ files = [
{file = "greenlet-3.2.3-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:1afd685acd5597349ee6d7a88a8bec83ce13c106ac78c196ee9dde7c04fe87be"}, {file = "greenlet-3.2.3-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:1afd685acd5597349ee6d7a88a8bec83ce13c106ac78c196ee9dde7c04fe87be"},
{file = "greenlet-3.2.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:761917cac215c61e9dc7324b2606107b3b292a8349bdebb31503ab4de3f559ac"}, {file = "greenlet-3.2.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:761917cac215c61e9dc7324b2606107b3b292a8349bdebb31503ab4de3f559ac"},
@@ -3183,7 +3195,7 @@ description = "Low-level, pure Python DBus protocol wrapper."
optional = false optional = false
python-versions = ">=3.7" python-versions = ">=3.7"
groups = ["dev"] groups = ["dev"]
markers = "platform_machine != \"ppc64le\" and platform_machine != \"s390x\" and sys_platform == \"linux\"" markers = "sys_platform == \"linux\" and platform_machine != \"ppc64le\" and platform_machine != \"s390x\""
files = [ files = [
{file = "jeepney-0.9.0-py3-none-any.whl", hash = "sha256:97e5714520c16fc0a45695e5365a2e11b81ea79bba796e26f9f1d178cb182683"}, {file = "jeepney-0.9.0-py3-none-any.whl", hash = "sha256:97e5714520c16fc0a45695e5365a2e11b81ea79bba796e26f9f1d178cb182683"},
{file = "jeepney-0.9.0.tar.gz", hash = "sha256:cf0e9e845622b81e4a28df94c40345400256ec608d0e55bb8a3feaa9163f5732"}, {file = "jeepney-0.9.0.tar.gz", hash = "sha256:cf0e9e845622b81e4a28df94c40345400256ec608d0e55bb8a3feaa9163f5732"},
@@ -4829,7 +4841,7 @@ description = "ONNX Runtime is a runtime accelerator for Machine Learning models
optional = false optional = false
python-versions = ">=3.10" python-versions = ">=3.10"
groups = ["main"] groups = ["main"]
markers = "python_version == \"3.12\" or python_version == \"3.13\"" markers = "python_version >= \"3.12\""
files = [ files = [
{file = "onnxruntime-1.22.0-cp310-cp310-macosx_13_0_universal2.whl", hash = "sha256:85d8826cc8054e4d6bf07f779dc742a363c39094015bdad6a08b3c18cfe0ba8c"}, {file = "onnxruntime-1.22.0-cp310-cp310-macosx_13_0_universal2.whl", hash = "sha256:85d8826cc8054e4d6bf07f779dc742a363c39094015bdad6a08b3c18cfe0ba8c"},
{file = "onnxruntime-1.22.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:468c9502a12f6f49ec335c2febd22fdceecc1e4cc96dfc27e419ba237dff5aff"}, {file = "onnxruntime-1.22.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:468c9502a12f6f49ec335c2febd22fdceecc1e4cc96dfc27e419ba237dff5aff"},
@@ -4937,6 +4949,21 @@ jsonschema-path = ">=0.3.1,<0.4.0"
lazy-object-proxy = ">=1.7.1,<2.0.0" lazy-object-proxy = ">=1.7.1,<2.0.0"
openapi-schema-validator = ">=0.6.0,<0.7.0" openapi-schema-validator = ">=0.6.0,<0.7.0"
[[package]]
name = "openpyxl"
version = "3.1.5"
description = "A Python library to read/write Excel 2010 xlsx/xlsm files"
optional = false
python-versions = ">=3.8"
groups = ["main"]
files = [
{file = "openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2"},
{file = "openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050"},
]
[package.dependencies]
et-xmlfile = "*"
[[package]] [[package]]
name = "opentelemetry-api" name = "opentelemetry-api"
version = "1.34.1" version = "1.34.1"
@@ -5930,7 +5957,7 @@ description = "A high-level API to automate web browsers"
optional = false optional = false
python-versions = ">=3.9" python-versions = ">=3.9"
groups = ["main"] groups = ["main"]
markers = "python_version == \"3.12\" or python_version == \"3.13\"" markers = "python_version >= \"3.12\""
files = [ files = [
{file = "playwright-1.53.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:48a1a15ce810f0ffe512b6050de9871ea193b41dd3cc1bbed87b8431012419ba"}, {file = "playwright-1.53.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:48a1a15ce810f0ffe512b6050de9871ea193b41dd3cc1bbed87b8431012419ba"},
{file = "playwright-1.53.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:a701f9498a5b87e3f929ec01cea3109fbde75821b19c7ba4bba54f6127b94f76"}, {file = "playwright-1.53.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:a701f9498a5b87e3f929ec01cea3109fbde75821b19c7ba4bba54f6127b94f76"},
@@ -6227,7 +6254,7 @@ description = "PostgreSQL database adapter for Python"
optional = false optional = false
python-versions = ">=3.7" python-versions = ">=3.7"
groups = ["main"] groups = ["main"]
markers = "python_version == \"3.12\" or python_version == \"3.11\"" markers = "python_version < \"3.13\""
files = [ files = [
{file = "psycopg-3.1.18-py3-none-any.whl", hash = "sha256:4d5a0a5a8590906daa58ebd5f3cfc34091377354a1acced269dd10faf55da60e"}, {file = "psycopg-3.1.18-py3-none-any.whl", hash = "sha256:4d5a0a5a8590906daa58ebd5f3cfc34091377354a1acced269dd10faf55da60e"},
{file = "psycopg-3.1.18.tar.gz", hash = "sha256:31144d3fb4c17d78094d9e579826f047d4af1da6a10427d91dfcfb6ecdf6f12b"}, {file = "psycopg-3.1.18.tar.gz", hash = "sha256:31144d3fb4c17d78094d9e579826f047d4af1da6a10427d91dfcfb6ecdf6f12b"},
@@ -6280,7 +6307,7 @@ description = "PostgreSQL database adapter for Python -- C optimisation distribu
optional = false optional = false
python-versions = ">=3.7" python-versions = ">=3.7"
groups = ["main"] groups = ["main"]
markers = "(python_version == \"3.12\" or python_version == \"3.11\") and implementation_name != \"pypy\"" markers = "python_version < \"3.13\" and implementation_name != \"pypy\""
files = [ files = [
{file = "psycopg_binary-3.1.18-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5c323103dfa663b88204cf5f028e83c77d7a715f9b6f51d2bbc8184b99ddd90a"}, {file = "psycopg_binary-3.1.18-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5c323103dfa663b88204cf5f028e83c77d7a715f9b6f51d2bbc8184b99ddd90a"},
{file = "psycopg_binary-3.1.18-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:887f8d856c91510148be942c7acd702ccf761a05f59f8abc123c22ab77b5a16c"}, {file = "psycopg_binary-3.1.18-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:887f8d856c91510148be942c7acd702ccf761a05f59f8abc123c22ab77b5a16c"},
@@ -6731,7 +6758,7 @@ description = "A rough port of Node.js's EventEmitter to Python with a few trick
optional = false optional = false
python-versions = ">=3.8" python-versions = ">=3.8"
groups = ["main"] groups = ["main"]
markers = "python_version == \"3.12\" or python_version == \"3.13\"" markers = "python_version >= \"3.12\""
files = [ files = [
{file = "pyee-13.0.0-py3-none-any.whl", hash = "sha256:48195a3cddb3b1515ce0695ed76036b5ccc2ef3a9f963ff9f77aec0139845498"}, {file = "pyee-13.0.0-py3-none-any.whl", hash = "sha256:48195a3cddb3b1515ce0695ed76036b5ccc2ef3a9f963ff9f77aec0139845498"},
{file = "pyee-13.0.0.tar.gz", hash = "sha256:b391e3c5a434d1f5118a25615001dbc8f669cf410ab67d04c4d4e07c55481c37"}, {file = "pyee-13.0.0.tar.gz", hash = "sha256:b391e3c5a434d1f5118a25615001dbc8f669cf410ab67d04c4d4e07c55481c37"},
@@ -7044,7 +7071,7 @@ description = "A (partial) reimplementation of pywin32 using ctypes/cffi"
optional = false optional = false
python-versions = ">=3.6" python-versions = ">=3.6"
groups = ["dev"] groups = ["dev"]
markers = "platform_machine != \"ppc64le\" and platform_machine != \"s390x\" and sys_platform == \"win32\"" markers = "sys_platform == \"win32\" and platform_machine != \"ppc64le\" and platform_machine != \"s390x\""
files = [ files = [
{file = "pywin32-ctypes-0.2.3.tar.gz", hash = "sha256:d162dc04946d704503b2edc4d55f3dba5c1d539ead017afa00142c38b9885755"}, {file = "pywin32-ctypes-0.2.3.tar.gz", hash = "sha256:d162dc04946d704503b2edc4d55f3dba5c1d539ead017afa00142c38b9885755"},
{file = "pywin32_ctypes-0.2.3-py3-none-any.whl", hash = "sha256:8a1513379d709975552d202d942d9837758905c8d01eb82b8bcc30918929e7b8"}, {file = "pywin32_ctypes-0.2.3-py3-none-any.whl", hash = "sha256:8a1513379d709975552d202d942d9837758905c8d01eb82b8bcc30918929e7b8"},
@@ -7784,7 +7811,7 @@ description = "Python bindings to FreeDesktop.org Secret Service API"
optional = false optional = false
python-versions = ">=3.6" python-versions = ">=3.6"
groups = ["dev"] groups = ["dev"]
markers = "platform_machine != \"ppc64le\" and platform_machine != \"s390x\" and sys_platform == \"linux\"" markers = "sys_platform == \"linux\" and platform_machine != \"ppc64le\" and platform_machine != \"s390x\""
files = [ files = [
{file = "SecretStorage-3.3.3-py3-none-any.whl", hash = "sha256:f356e6628222568e3af06f2eba8df495efa13b3b63081dafd4f7d9a7b7bc9f99"}, {file = "SecretStorage-3.3.3-py3-none-any.whl", hash = "sha256:f356e6628222568e3af06f2eba8df495efa13b3b63081dafd4f7d9a7b7bc9f99"},
{file = "SecretStorage-3.3.3.tar.gz", hash = "sha256:2403533ef369eca6d2ba81718576c5e0f564d5cca1b58f73a8b23e7d4eeebd77"}, {file = "SecretStorage-3.3.3.tar.gz", hash = "sha256:2403533ef369eca6d2ba81718576c5e0f564d5cca1b58f73a8b23e7d4eeebd77"},
@@ -9128,7 +9155,7 @@ description = "Fast implementation of asyncio event loop on top of libuv"
optional = false optional = false
python-versions = ">=3.8.0" python-versions = ">=3.8.0"
groups = ["main"] groups = ["main"]
markers = "sys_platform != \"win32\" and sys_platform != \"cygwin\" and platform_python_implementation != \"PyPy\"" markers = "platform_python_implementation != \"PyPy\" and sys_platform != \"win32\" and sys_platform != \"cygwin\""
files = [ files = [
{file = "uvloop-0.21.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:ec7e6b09a6fdded42403182ab6b832b71f4edaf7f37a9a0e371a01db5f0cb45f"}, {file = "uvloop-0.21.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:ec7e6b09a6fdded42403182ab6b832b71f4edaf7f37a9a0e371a01db5f0cb45f"},
{file = "uvloop-0.21.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:196274f2adb9689a289ad7d65700d37df0c0930fd8e4e743fa4834e850d7719d"}, {file = "uvloop-0.21.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:196274f2adb9689a289ad7d65700d37df0c0930fd8e4e743fa4834e850d7719d"},
@@ -9763,4 +9790,4 @@ type = ["pytest-mypy"]
[metadata] [metadata]
lock-version = "2.1" lock-version = "2.1"
python-versions = ">=3.11,<3.14" python-versions = ">=3.11,<3.14"
content-hash = "667e626dd8d08bae4f9b852616a9c2c5df9bdd4a3a37f7ef87030d7bfdf51f3b" content-hash = "441c7080f7fbccb87de476e56dd86d1e56e4c0b8eaa8378d13c90d94a0a42123"

View File

@@ -78,6 +78,7 @@ lark = "^1.2.2"
libcst = "^1.8.2" libcst = "^1.8.2"
curlparser = "^0.1.0" curlparser = "^0.1.0"
lmnr = {extras = ["all"], version = "^0.7.0"} lmnr = {extras = ["all"], version = "^0.7.0"}
openpyxl = "^3.1.5"
[tool.poetry.group.dev.dependencies] [tool.poetry.group.dev.dependencies]
isort = "^5.13.2" isort = "^5.13.2"

View File

@@ -10,6 +10,8 @@ import { useDebugStore } from "@/store/useDebugStore";
import { cn } from "@/util/utils"; import { cn } from "@/util/utils";
import { NodeHeader } from "../components/NodeHeader"; import { NodeHeader } from "../components/NodeHeader";
import { useParams } from "react-router-dom"; import { useParams } from "react-router-dom";
import { WorkflowDataSchemaInputGroup } from "@/components/DataSchemaInputGroup/WorkflowDataSchemaInputGroup";
import { dataSchemaExampleForFileExtraction } from "../types";
function FileParserNode({ id, data }: NodeProps<FileParserNode>) { function FileParserNode({ id, data }: NodeProps<FileParserNode>) {
const { updateNodeData } = useReactFlow(); const { updateNodeData } = useReactFlow();
@@ -21,8 +23,17 @@ function FileParserNode({ id, data }: NodeProps<FileParserNode>) {
urlBlockLabel !== undefined && urlBlockLabel === label; urlBlockLabel !== undefined && urlBlockLabel === label;
const [inputs, setInputs] = useState({ const [inputs, setInputs] = useState({
fileUrl: data.fileUrl, fileUrl: data.fileUrl,
jsonSchema: data.jsonSchema,
}); });
function handleChange(key: string, value: unknown) {
if (!data.editable) {
return;
}
setInputs({ ...inputs, [key]: value });
updateNodeData(id, { [key]: value });
}
const isFirstWorkflowBlock = useIsFirstBlockInWorkflow({ id }); const isFirstWorkflowBlock = useIsFirstBlockInWorkflow({ id });
return ( return (
@@ -75,15 +86,19 @@ function FileParserNode({ id, data }: NodeProps<FileParserNode>) {
nodeId={id} nodeId={id}
value={inputs.fileUrl} value={inputs.fileUrl}
onChange={(value) => { onChange={(value) => {
if (!data.editable) { handleChange("fileUrl", value);
return;
}
setInputs({ ...inputs, fileUrl: value });
updateNodeData(id, { fileUrl: value });
}} }}
className="nopan text-xs" className="nopan text-xs"
/> />
</div> </div>
<WorkflowDataSchemaInputGroup
exampleValue={dataSchemaExampleForFileExtraction}
value={inputs.jsonSchema}
onChange={(value) => {
handleChange("jsonSchema", value);
}}
suggestionContext={{}}
/>
</div> </div>
</div> </div>
</div> </div>

View File

@@ -1,9 +1,11 @@
import type { Node } from "@xyflow/react"; import type { Node } from "@xyflow/react";
import { NodeBaseData } from "../types"; import { NodeBaseData } from "../types";
import { AppNode } from "..";
import { debuggableWorkflowBlockTypes } from "@/routes/workflows/types/workflowTypes"; import { debuggableWorkflowBlockTypes } from "@/routes/workflows/types/workflowTypes";
export type FileParserNodeData = NodeBaseData & { export type FileParserNodeData = NodeBaseData & {
fileUrl: string; fileUrl: string;
jsonSchema: string;
}; };
export type FileParserNode = Node<FileParserNodeData, "fileParser">; export type FileParserNode = Node<FileParserNodeData, "fileParser">;
@@ -14,5 +16,10 @@ export const fileParserNodeDefaultData: FileParserNodeData = {
label: "", label: "",
fileUrl: "", fileUrl: "",
continueOnFailure: false, continueOnFailure: false,
jsonSchema: "null",
model: null, model: null,
} as const; } as const;
export function isFileParserNode(node: AppNode): node is FileParserNode {
return node.type === "fileParser";
}

View File

@@ -162,20 +162,19 @@ const nodeLibraryItems: Array<{
/> />
), ),
title: "File Parser Block", title: "File Parser Block",
description: "Parse data from files", description: "Parse PDFs, CSVs, and Excel files",
},
{
nodeType: "pdfParser",
icon: (
<WorkflowBlockIcon
workflowBlockType={WorkflowBlockTypes.PDFParser}
className="size-6"
/>
),
title: "PDF Parser Block",
description: "Extract data from PDF files",
}, },
// { // {
// nodeType: "pdfParser",
// icon: (
// <WorkflowBlockIcon
// workflowBlockType={WorkflowBlockTypes.PDFParser}
// className="size-6"
// />
// ),
// title: "PDF Parser Block",
// description: "Extract data from PDF files",
// },
// nodeType: "upload", // nodeType: "upload",
// icon: ( // icon: (
// <WorkflowBlockIcon // <WorkflowBlockIcon

View File

@@ -56,7 +56,10 @@ import { ParametersState } from "./types";
import { AppNode, isWorkflowBlockNode, WorkflowBlockNode } from "./nodes"; import { AppNode, isWorkflowBlockNode, WorkflowBlockNode } from "./nodes";
import { codeBlockNodeDefaultData } from "./nodes/CodeBlockNode/types"; import { codeBlockNodeDefaultData } from "./nodes/CodeBlockNode/types";
import { downloadNodeDefaultData } from "./nodes/DownloadNode/types"; import { downloadNodeDefaultData } from "./nodes/DownloadNode/types";
import { fileParserNodeDefaultData } from "./nodes/FileParserNode/types"; import {
isFileParserNode,
fileParserNodeDefaultData,
} from "./nodes/FileParserNode/types";
import { import {
isLoopNode, isLoopNode,
LoopNode, LoopNode,
@@ -468,6 +471,7 @@ function convertToNode(
data: { data: {
...commonData, ...commonData,
fileUrl: block.file_url, fileUrl: block.file_url,
jsonSchema: JSON.stringify(block.json_schema, null, 2),
}, },
}; };
} }
@@ -1254,7 +1258,8 @@ function getWorkflowBlock(node: WorkflowBlockNode): BlockYAML {
...base, ...base,
block_type: "file_url_parser", block_type: "file_url_parser",
file_url: node.data.fileUrl, file_url: node.data.fileUrl,
file_type: "csv", file_type: "csv", // Backend will auto-detect based on file extension
json_schema: JSONParseSafe(node.data.jsonSchema),
}; };
} }
case "textPrompt": { case "textPrompt": {
@@ -2187,6 +2192,15 @@ function getWorkflowErrors(nodes: Array<AppNode>): Array<string> {
} }
}); });
const fileParserNodes = nodes.filter(isFileParserNode);
fileParserNodes.forEach((node) => {
try {
JSON.parse(node.data.jsonSchema);
} catch {
errors.push(`${node.data.label}: Data schema is not valid JSON.`);
}
});
const waitNodes = nodes.filter(isWaitNode); const waitNodes = nodes.filter(isWaitNode);
waitNodes.forEach((node) => { waitNodes.forEach((node) => {
const waitTimeString = node.data.waitInSeconds.trim(); const waitTimeString = node.data.waitInSeconds.trim();

View File

@@ -354,7 +354,8 @@ export type SendEmailBlock = WorkflowBlockBase & {
export type FileURLParserBlock = WorkflowBlockBase & { export type FileURLParserBlock = WorkflowBlockBase & {
block_type: "file_url_parser"; block_type: "file_url_parser";
file_url: string; file_url: string;
file_type: "csv"; file_type: "csv" | "excel" | "pdf";
json_schema: Record<string, unknown> | null;
}; };
export type ValidationBlock = WorkflowBlockBase & { export type ValidationBlock = WorkflowBlockBase & {

View File

@@ -308,7 +308,8 @@ export type SendEmailBlockYAML = BlockYAMLBase & {
export type FileUrlParserBlockYAML = BlockYAMLBase & { export type FileUrlParserBlockYAML = BlockYAMLBase & {
block_type: "file_url_parser"; block_type: "file_url_parser";
file_url: string; file_url: string;
file_type: "csv"; file_type: "csv" | "excel" | "pdf";
json_schema?: Record<string, unknown> | null;
}; };
export type ForLoopBlockYAML = BlockYAMLBase & { export type ForLoopBlockYAML = BlockYAMLBase & {

View File

@@ -21,6 +21,7 @@ from typing import Annotated, Any, Awaitable, Callable, Literal, Union
from urllib.parse import quote, urlparse from urllib.parse import quote, urlparse
import filetype import filetype
import pandas as pd
import structlog import structlog
from email_validator import EmailNotValidError, validate_email from email_validator import EmailNotValidError, validate_email
from jinja2.sandbox import SandboxedEnvironment from jinja2.sandbox import SandboxedEnvironment
@@ -2342,6 +2343,8 @@ class SendEmailBlock(Block):
class FileType(StrEnum): class FileType(StrEnum):
CSV = "csv" CSV = "csv"
EXCEL = "excel"
PDF = "pdf"
class FileParserBlock(Block): class FileParserBlock(Block):
@@ -2349,6 +2352,7 @@ class FileParserBlock(Block):
file_url: str file_url: str
file_type: FileType file_type: FileType
json_schema: dict[str, Any] | None = None
def get_all_parameters( def get_all_parameters(
self, self,
@@ -2364,6 +2368,18 @@ class FileParserBlock(Block):
self.file_url, workflow_run_context self.file_url, workflow_run_context
) )
def _detect_file_type_from_url(self, file_url: str) -> FileType:
    """Detect the file type from the file extension in the URL.

    Only the URL *path* is inspected, so query strings and fragments
    (e.g. signed S3 URLs like ``.../report.xlsx?X-Amz-Signature=...``)
    do not hide the real extension.

    Falls back to CSV for ``.csv`` and any unknown extension, preserving
    the block's historic default behavior.
    """
    # Strip query/fragment before checking the extension; matching on the
    # raw URL string would miss e.g. "file.pdf?sig=abc".
    path_lower = urlparse(file_url).path.lower()
    if path_lower.endswith((".xlsx", ".xls", ".xlsm")):
        return FileType.EXCEL
    if path_lower.endswith(".pdf"):
        return FileType.PDF
    if path_lower.endswith(".tsv"):
        # TSV files are handled by the CSV parser (delimiter is sniffed).
        return FileType.CSV
    # Default to CSV for .csv and any other extensions.
    return FileType.CSV
def validate_file_type(self, file_url_used: str, file_path: str) -> None: def validate_file_type(self, file_url_used: str, file_path: str) -> None:
if self.file_type == FileType.CSV: if self.file_type == FileType.CSV:
try: try:
@@ -2371,6 +2387,121 @@ class FileParserBlock(Block):
csv.Sniffer().sniff(file.read(1024)) csv.Sniffer().sniff(file.read(1024))
except csv.Error as e: except csv.Error as e:
raise InvalidFileType(file_url=file_url_used, file_type=self.file_type, error=str(e)) raise InvalidFileType(file_url=file_url_used, file_type=self.file_type, error=str(e))
elif self.file_type == FileType.EXCEL:
try:
# Try to read the file with pandas to validate it's a valid Excel file
pd.read_excel(file_path, nrows=1, engine="openpyxl")
except Exception as e:
raise InvalidFileType(
file_url=file_url_used, file_type=self.file_type, error=f"Invalid Excel file format: {str(e)}"
)
elif self.file_type == FileType.PDF:
try:
# Try to read the file with PyPDF to validate it's a valid PDF file
reader = PdfReader(file_path)
# Just check if we can access pages, don't read content yet
_ = len(reader.pages)
except Exception as e:
raise InvalidFileType(file_url=file_url_used, file_type=self.file_type, error=str(e))
async def _parse_csv_file(self, file_path: str) -> list[dict[str, Any]]:
"""Parse CSV/TSV file and return list of dictionaries."""
parsed_data = []
with open(file_path) as file:
# Try to detect the delimiter (comma for CSV, tab for TSV)
sample = file.read(1024)
file.seek(0) # Reset file pointer
# Use csv.Sniffer to detect the delimiter
try:
dialect = csv.Sniffer().sniff(sample)
delimiter = dialect.delimiter
except csv.Error:
# Default to comma if detection fails
delimiter = ","
reader = csv.DictReader(file, delimiter=delimiter)
for row in reader:
parsed_data.append(row)
return parsed_data
def _clean_dataframe_for_json(self, df: pd.DataFrame) -> list[dict[str, Any]]:
"""Clean DataFrame to ensure it can be serialized to JSON."""
# Replace NaN and NaT values with "nan" string
df_cleaned = df.replace({pd.NA: "nan", pd.NaT: "nan"})
df_cleaned = df_cleaned.where(pd.notna(df_cleaned), "nan")
# Convert to list of dictionaries
records = df_cleaned.to_dict("records")
# Additional cleaning for any remaining problematic values
for record in records:
for key, value in record.items():
if pd.isna(value) or value == "NaN" or value == "NaT":
record[key] = "nan"
elif isinstance(value, (pd.Timestamp, pd.DatetimeTZDtype)):
# Convert pandas timestamps to ISO format strings
record[key] = value.isoformat() if pd.notna(value) else "nan"
return records
async def _parse_excel_file(self, file_path: str) -> list[dict[str, Any]]:
    """Read an Excel workbook and return its rows as JSON-safe dictionaries.

    Raises:
        InvalidFileType: when the openpyxl dependency is missing, or when
            pandas cannot parse the file as an Excel workbook.
    """
    try:
        # Load the workbook via pandas with an explicit engine, then run the
        # shared DataFrame -> records cleanup so values are JSON-safe.
        frame = pd.read_excel(file_path, engine="openpyxl")
        return self._clean_dataframe_for_json(frame)
    except ImportError as import_error:
        raise InvalidFileType(
            file_url=self.file_url,
            file_type=self.file_type,
            error=f"Missing required dependency for Excel parsing: {str(import_error)}. Please install openpyxl: pip install openpyxl",
        )
    except Exception as parse_error:
        raise InvalidFileType(
            file_url=self.file_url, file_type=self.file_type, error=f"Failed to parse Excel file: {str(parse_error)}"
        )
async def _parse_pdf_file(self, file_path: str) -> str:
    """Parse a PDF file and return the extracted text of all pages.

    Each page's text is terminated with a newline, preserving the
    page-by-page layout of the output.

    Raises:
        InvalidFileType: if pypdf cannot read the file.
    """
    try:
        reader = PdfReader(file_path)
        # Iterate pages directly and join once instead of building the
        # string with repeated ``+=`` (quadratic on large documents).
        return "".join(page.extract_text() + "\n" for page in reader.pages)
    except PdfReadError as e:
        raise InvalidFileType(file_url=self.file_url, file_type=self.file_type, error=str(e))
async def _extract_with_ai(
    self, content: str | list[dict[str, Any]], workflow_run_context: WorkflowRunContext
) -> dict[str, Any]:
    """Extract structured data from parsed file content using the LLM.

    Falls back to a generic object schema when the block has no
    ``json_schema`` configured. ``workflow_run_context`` is part of the
    internal call signature but is not read here.
    """
    # Default schema used when the block was configured without one; using a
    # local variable avoids mutating the instance.
    default_schema: dict[str, Any] = {
        "type": "object",
        "properties": {
            "output": {
                "type": "object",
                "description": "Information extracted from the file",
            }
        },
    }
    effective_schema = self.json_schema or default_schema

    # Tabular rows (CSV/Excel) are serialized to pretty-printed JSON so the
    # prompt receives plain text either way.
    text_content = json.dumps(content, indent=2) if isinstance(content, list) else content

    llm_prompt = prompt_engine.load_prompt(
        "extract-information-from-file-text", extracted_text_content=text_content, json_schema=effective_schema
    )
    return await app.LLM_API_HANDLER(prompt=llm_prompt, prompt_name="extract-information-from-file-text")
async def execute( async def execute(
self, self,
@@ -2381,6 +2512,7 @@ class FileParserBlock(Block):
**kwargs: dict, **kwargs: dict,
) -> BlockResult: ) -> BlockResult:
workflow_run_context = self.get_workflow_run_context(workflow_run_id) workflow_run_context = self.get_workflow_run_context(workflow_run_id)
if ( if (
self.file_url self.file_url
and workflow_run_context.has_parameter(self.file_url) and workflow_run_context.has_parameter(self.file_url)
@@ -2412,21 +2544,71 @@ class FileParserBlock(Block):
file_path = await download_from_s3(self.get_async_aws_client(), self.file_url) file_path = await download_from_s3(self.get_async_aws_client(), self.file_url)
else: else:
file_path = await download_file(self.file_url) file_path = await download_file(self.file_url)
# Auto-detect file type based on file extension
detected_file_type = self._detect_file_type_from_url(self.file_url)
self.file_type = detected_file_type
# Validate the file type # Validate the file type
self.validate_file_type(self.file_url, file_path) self.validate_file_type(self.file_url, file_path)
# Parse the file into a list of dictionaries where each dictionary represents a row in the file
parsed_data = [] LOG.debug(
with open(file_path) as file: "FileParserBlock: After file type validation",
if self.file_type == FileType.CSV: file_type=self.file_type,
reader = csv.DictReader(file) json_schema_present=self.json_schema is not None,
for row in reader: json_schema_type=type(self.json_schema),
parsed_data.append(row) )
# Parse the file based on type
parsed_data: str | list[dict[str, Any]]
if self.file_type == FileType.CSV:
parsed_data = await self._parse_csv_file(file_path)
elif self.file_type == FileType.EXCEL:
parsed_data = await self._parse_excel_file(file_path)
elif self.file_type == FileType.PDF:
parsed_data = await self._parse_pdf_file(file_path)
else:
return await self.build_block_result(
success=False,
failure_reason=f"Unsupported file type: {self.file_type}",
output_parameter_value=None,
status=BlockStatus.failed,
workflow_run_block_id=workflow_run_block_id,
organization_id=organization_id,
)
# If json_schema is provided, use AI to extract structured data
final_data: str | list[dict[str, Any]] | dict[str, Any]
LOG.debug(
"FileParserBlock: JSON schema check",
has_json_schema=self.json_schema is not None,
json_schema_type=type(self.json_schema),
json_schema=self.json_schema,
)
if self.json_schema:
try:
ai_extracted_data = await self._extract_with_ai(parsed_data, workflow_run_context)
final_data = ai_extracted_data
except Exception as e:
return await self.build_block_result(
success=False,
failure_reason=f"Failed to extract data with AI: {str(e)}",
output_parameter_value=None,
status=BlockStatus.failed,
workflow_run_block_id=workflow_run_block_id,
organization_id=organization_id,
)
else:
# Return raw parsed data
final_data = parsed_data
# Record the parsed data # Record the parsed data
await self.record_output_parameter_value(workflow_run_context, workflow_run_id, parsed_data) await self.record_output_parameter_value(workflow_run_context, workflow_run_id, final_data)
return await self.build_block_result( return await self.build_block_result(
success=True, success=True,
failure_reason=None, failure_reason=None,
output_parameter_value=parsed_data, output_parameter_value=final_data,
status=BlockStatus.completed, status=BlockStatus.completed,
workflow_run_block_id=workflow_run_block_id, workflow_run_block_id=workflow_run_block_id,
organization_id=organization_id, organization_id=organization_id,
@@ -2434,6 +2616,11 @@ class FileParserBlock(Block):
class PDFParserBlock(Block): class PDFParserBlock(Block):
"""
DEPRECATED: Use FileParserBlock with file_type=FileType.PDF instead.
This block will be removed in a future version.
"""
block_type: Literal[BlockType.PDF_PARSER] = BlockType.PDF_PARSER block_type: Literal[BlockType.PDF_PARSER] = BlockType.PDF_PARSER
file_url: str file_url: str

View File

@@ -244,6 +244,7 @@ class FileParserBlockYAML(BlockYAML):
file_url: str file_url: str
file_type: FileType file_type: FileType
json_schema: dict[str, Any] | None = None
class PDFParserBlockYAML(BlockYAML): class PDFParserBlockYAML(BlockYAML):

View File

@@ -1926,6 +1926,7 @@ class WorkflowService:
output_parameter=output_parameter, output_parameter=output_parameter,
file_url=block_yaml.file_url, file_url=block_yaml.file_url,
file_type=block_yaml.file_type, file_type=block_yaml.file_type,
json_schema=block_yaml.json_schema,
continue_on_failure=block_yaml.continue_on_failure, continue_on_failure=block_yaml.continue_on_failure,
) )
elif block_yaml.block_type == BlockType.PDF_PARSER: elif block_yaml.block_type == BlockType.PDF_PARSER:

View File

@@ -0,0 +1,252 @@
import os
import tempfile
from datetime import datetime
from unittest.mock import MagicMock, patch
import pandas as pd
import pytest
from skyvern.forge.sdk.workflow.models.block import FileParserBlock, FileType
from skyvern.forge.sdk.workflow.models.parameter import OutputParameter
class TestFileParserBlock:
@pytest.fixture
def file_parser_block(self):
    """Build a minimal CSV-mode FileParserBlock wired to a mocked OutputParameter."""
    output_param = MagicMock(spec=OutputParameter)
    # Populate every field the block's pydantic model reads off the parameter.
    output_param.parameter_type = "output"
    output_param.key = "test_output"
    output_param.output_parameter_id = "test_id"
    output_param.workflow_id = "test_workflow_id"
    output_param.created_at = datetime.now()
    output_param.modified_at = datetime.now()
    output_param.deleted_at = None
    return FileParserBlock(
        label="test_parser",
        output_parameter=output_param,
        file_url="test.csv",
        file_type=FileType.CSV,
    )
@pytest.fixture
def csv_file(self):
    """Yield the path of a throwaway two-row CSV file; remove it afterwards."""
    with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as handle:
        handle.write("name,age,city\nJohn,30,New York\nJane,25,Boston")
        path = handle.name
    yield path
    os.unlink(path)
@pytest.fixture
def excel_file(self):
    """Yield the path of a throwaway two-row Excel workbook; remove it afterwards.

    Uses mkstemp + close instead of writing to an open NamedTemporaryFile's
    name: on Windows a NamedTemporaryFile cannot be reopened by another
    writer (pandas) while the original handle is still open.
    """
    df = pd.DataFrame({"name": ["John", "Jane"], "age": [30, 25], "city": ["New York", "Boston"]})
    fd, path = tempfile.mkstemp(suffix=".xlsx")
    os.close(fd)  # release the OS handle before pandas reopens the path
    try:
        df.to_excel(path, index=False)
        yield path
    finally:
        os.unlink(path)
@pytest.fixture
def tsv_file(self):
    """Yield the path of a throwaway two-row TSV file; remove it afterwards."""
    with tempfile.NamedTemporaryFile(mode="w", suffix=".tsv", delete=False) as handle:
        handle.write("name\tage\tcity\nJohn\t30\tNew York\nJane\t25\tBoston")
        path = handle.name
    yield path
    os.unlink(path)
def test_file_type_enum_values(self):
    """Each FileType member compares equal to its expected string value."""
    expected = {FileType.CSV: "csv", FileType.EXCEL: "excel", FileType.PDF: "pdf"}
    for member, value in expected.items():
        assert member == value
def test_file_parser_block_initialization(self, file_parser_block):
    """The fixture's constructor arguments land on the block, schema defaults to None."""
    block = file_parser_block
    assert block.label == "test_parser"
    assert block.file_url == "test.csv"
    assert block.file_type == FileType.CSV
    assert block.json_schema is None
def test_file_parser_block_with_schema(self):
    """A json_schema passed at construction time is stored on the block untouched."""
    schema = {"type": "object", "properties": {"name": {"type": "string"}, "age": {"type": "integer"}}}
    output_param = MagicMock(spec=OutputParameter)
    output_param.parameter_type = "output"
    output_param.key = "test_output"
    output_param.output_parameter_id = "test_id"
    output_param.workflow_id = "test_workflow_id"
    output_param.created_at = datetime.now()
    output_param.modified_at = datetime.now()
    output_param.deleted_at = None
    parser = FileParserBlock(
        label="test_parser",
        output_parameter=output_param,
        file_url="test.csv",
        file_type=FileType.CSV,
        json_schema=schema,
    )
    assert parser.json_schema == schema
@pytest.mark.asyncio
async def test_parse_csv_file(self, file_parser_block, csv_file):
    """Comma-separated rows parse into dicts with all values as strings."""
    rows = await file_parser_block._parse_csv_file(csv_file)
    assert rows == [
        {"name": "John", "age": "30", "city": "New York"},
        {"name": "Jane", "age": "25", "city": "Boston"},
    ]
@pytest.mark.asyncio
async def test_parse_excel_file(self, file_parser_block, excel_file):
    """Excel rows parse into dicts, preserving numeric cell types."""
    rows = await file_parser_block._parse_excel_file(excel_file)
    assert rows == [
        {"name": "John", "age": 30, "city": "New York"},
        {"name": "Jane", "age": 25, "city": "Boston"},
    ]
@pytest.mark.asyncio
async def test_parse_tsv_file(self, file_parser_block, tsv_file):
    """Tab-separated rows go through the same CSV parser and yield string values."""
    rows = await file_parser_block._parse_csv_file(tsv_file)
    assert rows == [
        {"name": "John", "age": "30", "city": "New York"},
        {"name": "Jane", "age": "25", "city": "Boston"},
    ]
@pytest.mark.asyncio
async def test_validate_csv_file_type(self, file_parser_block, csv_file):
    """A well-formed CSV passes validation without raising."""
    file_parser_block.validate_file_type("test.csv", csv_file)
@pytest.mark.asyncio
async def test_validate_excel_file_type(self, file_parser_block, excel_file):
    """A well-formed workbook passes validation without raising in EXCEL mode."""
    file_parser_block.file_type = FileType.EXCEL
    file_parser_block.validate_file_type("test.xlsx", excel_file)
@pytest.mark.asyncio
async def test_validate_invalid_csv_file(self, file_parser_block):
    """Binary garbage carrying a .csv name must be rejected by validation."""
    with tempfile.NamedTemporaryFile(mode="wb", suffix=".csv", delete=False) as handle:
        handle.write(bytes(range(16)))  # 0x00..0x0f — definitely not text/CSV
        path = handle.name
    try:
        # NOTE(review): pytest.raises(Exception) is broad (flake8-bugbear B017);
        # narrow to the concrete error validate_file_type raises once confirmed.
        with pytest.raises(Exception):
            file_parser_block.validate_file_type("test.csv", path)
    finally:
        os.unlink(path)
@pytest.mark.asyncio
async def test_extract_with_ai_with_schema(self, file_parser_block):
    """A caller-supplied json_schema drives the AI extraction path exactly once."""
    file_parser_block.json_schema = {
        "type": "object",
        "properties": {
            "extracted_data": {
                "type": "object",
                "properties": {
                    "names": {"type": "array", "items": {"type": "string"}},
                    "total_count": {"type": "integer"},
                },
            }
        },
    }
    canned = {"extracted_data": {"names": ["John", "Jane"], "total_count": 2}}
    with (
        patch("skyvern.forge.sdk.workflow.models.block.app.LLM_API_HANDLER") as llm_handler,
        patch("skyvern.forge.sdk.workflow.models.block.prompt_engine.load_prompt") as load_prompt,
    ):
        llm_handler.return_value = canned
        load_prompt.return_value = "mocked prompt"
        result = await file_parser_block._extract_with_ai([{"name": "John"}, {"name": "Jane"}], MagicMock())
    assert result == canned
    llm_handler.assert_called_once()
    load_prompt.assert_called_once()
@pytest.mark.asyncio
async def test_extract_with_ai_without_schema(self, file_parser_block):
    """Without a schema the default prompt is used and the block is not mutated."""
    canned = {"output": {"summary": "Extracted data from file"}}
    with (
        patch("skyvern.forge.sdk.workflow.models.block.app.LLM_API_HANDLER") as llm_handler,
        patch("skyvern.forge.sdk.workflow.models.block.prompt_engine.load_prompt") as load_prompt,
    ):
        llm_handler.return_value = canned
        load_prompt.return_value = "mocked prompt"
        result = await file_parser_block._extract_with_ai("Some text content", MagicMock())
    assert result == canned
    # The helper must not write a default schema back onto the block instance.
    assert file_parser_block.json_schema is None
    llm_handler.assert_called_once()
    load_prompt.assert_called_once()
def test_detect_file_type_from_url(self, file_parser_block):
    """URL extensions map to the right FileType, with CSV as the fallback."""
    cases = [
        # Excel variants
        ("https://example.com/data.xlsx", FileType.EXCEL),
        ("https://example.com/data.xls", FileType.EXCEL),
        ("https://example.com/data.xlsm", FileType.EXCEL),
        # PDF
        ("https://example.com/document.pdf", FileType.PDF),
        # CSV-ish and unknown extensions fall back to CSV
        ("https://example.com/data.csv", FileType.CSV),
        ("https://example.com/data.tsv", FileType.CSV),
        ("https://example.com/data.txt", FileType.CSV),
        ("https://example.com/data", FileType.CSV),
    ]
    for url, expected in cases:
        assert file_parser_block._detect_file_type_from_url(url) == expected
def test_clean_dataframe_for_json(self, file_parser_block):
    """NaN/NaT/NA cells become the string "nan"; timestamps become ISO-8601 strings."""
    frame = pd.DataFrame(
        {
            "OrderDate": ["2018-01-01", pd.NaT, "2018-01-03"],
            "Region": ["North", "South", pd.NA],
            "Sales": [1000.0, pd.NA, 3000.0],
            "Timestamp": [pd.Timestamp("2018-01-01"), pd.NaT, pd.Timestamp("2018-01-03")],
        }
    )
    rows = file_parser_block._clean_dataframe_for_json(frame)
    expected = [
        {"OrderDate": "2018-01-01", "Region": "North", "Sales": 1000.0, "Timestamp": "2018-01-01T00:00:00"},
        {"OrderDate": "nan", "Region": "South", "Sales": "nan", "Timestamp": "nan"},
        {"OrderDate": "2018-01-03", "Region": "nan", "Sales": 3000.0, "Timestamp": "2018-01-03T00:00:00"},
    ]
    for row, want in zip(rows, expected):
        for column, value in want.items():
            assert row[column] == value