Making file parser flexible to deprecate pdf parser (#3073)
Co-authored-by: Suchintan <suchintan@users.noreply.github.com>
This commit is contained in:
@@ -19,7 +19,7 @@ Building blocks supported today:
|
|||||||
- TextPromptBlock: A text only prompt block.
|
- TextPromptBlock: A text only prompt block.
|
||||||
- SendEmailBlock: Send an email.
|
- SendEmailBlock: Send an email.
|
||||||
- FileDownloadBlock: Given a goal, Skyvern downloads a file from the website.
|
- FileDownloadBlock: Given a goal, Skyvern downloads a file from the website.
|
||||||
- FileParserBlock: Given a file url, Skyvern downloads the file from the url, and returns the parsed content as the output of the block. Currently only support CSV file format.
|
- FileParserBlock: Given a file url, Skyvern downloads the file from the url, and returns the parsed content as the output of the block. Supports CSV, Excel, and PDF file formats.
|
||||||
- PDFParserBlock: Given a pdf url, Skyvern downloads the PDF file from the url and returns the parsed content as the output of the block.
|
- PDFParserBlock: Given a pdf url, Skyvern downloads the PDF file from the url and returns the parsed content as the output of the block.
|
||||||
- FileUploadBlock: Upload all the downloaded files to a desired destination. Currently only AWS S3 is supported. Please contact support@skyvern.com if you need more integrations.
|
- FileUploadBlock: Upload all the downloaded files to a desired destination. Currently only AWS S3 is supported. Please contact support@skyvern.com if you need more integrations.
|
||||||
- WaitBlock: Wait for a given amount of time.
|
- WaitBlock: Wait for a given amount of time.
|
||||||
|
|||||||
@@ -43,7 +43,7 @@ This block sends an email.
|
|||||||
This block downloads a file from the website.
|
This block downloads a file from the website.
|
||||||
|
|
||||||
## FileParserBlock
|
## FileParserBlock
|
||||||
This block parses a file from the website.
|
This block parses PDFs, CSVs, and Excel files from the website.
|
||||||
|
|
||||||
## PDFParserBlock
|
## PDFParserBlock
|
||||||
This block parses a PDF file from the website.
|
This block parses a PDF file from the website.
|
||||||
|
|||||||
@@ -228,16 +228,16 @@ Inputs:
|
|||||||
|
|
||||||
Downloads and parses a file to be used within other workflow blocks.
|
Downloads and parses a file to be used within other workflow blocks.
|
||||||
|
|
||||||
**Supported types:** CSV
|
**Supported types:** CSV, TSV, Excel, PDF
|
||||||
|
|
||||||
```
|
```
|
||||||
- block_type: file_url_parser
|
- block_type: file_url_parser
|
||||||
label: csv_parser
|
label: file_parser
|
||||||
file_type: csv
|
file_type: csv # Auto-detected from URL extension
|
||||||
file_url: <csv_file_url>
|
file_url: <file_url>
|
||||||
```
|
```
|
||||||
|
|
||||||
Inputs:
|
Inputs:
|
||||||
|
|
||||||
1. **File URL *(required):*** This block allows you to use a CSV within your workflow.
|
1. **File URL *(required):*** This block allows you to use CSV, TSV, Excel, and PDF files within your workflow.
|
||||||
* Since we’re still in beta, you will need to [contact us](https://meetings.hubspot.com/skyvern/demo?uuid=7c83865f-1a92-4c44-9e52-1ba0dbc04f7a) to load a value into this block
|
* Since we’re still in beta, you will need to [contact us](https://meetings.hubspot.com/skyvern/demo?uuid=7c83865f-1a92-4c44-9e52-1ba0dbc04f7a) to load a value into this block
|
||||||
51
poetry.lock
generated
51
poetry.lock
generated
@@ -1,4 +1,4 @@
|
|||||||
# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand.
|
# This file is automatically @generated by Poetry 2.1.3 and should not be changed by hand.
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "about-time"
|
name = "about-time"
|
||||||
@@ -1587,6 +1587,18 @@ files = [
|
|||||||
[package.extras]
|
[package.extras]
|
||||||
mypy = ["mypy"]
|
mypy = ["mypy"]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "et-xmlfile"
|
||||||
|
version = "2.0.0"
|
||||||
|
description = "An implementation of lxml.xmlfile for the standard library"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.8"
|
||||||
|
groups = ["main"]
|
||||||
|
files = [
|
||||||
|
{file = "et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa"},
|
||||||
|
{file = "et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54"},
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "exceptiongroup"
|
name = "exceptiongroup"
|
||||||
version = "1.3.0"
|
version = "1.3.0"
|
||||||
@@ -2374,7 +2386,7 @@ description = "Lightweight in-process concurrent programming"
|
|||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.9"
|
python-versions = ">=3.9"
|
||||||
groups = ["main"]
|
groups = ["main"]
|
||||||
markers = "python_version == \"3.12\" or python_version == \"3.13\""
|
markers = "python_version >= \"3.12\""
|
||||||
files = [
|
files = [
|
||||||
{file = "greenlet-3.2.3-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:1afd685acd5597349ee6d7a88a8bec83ce13c106ac78c196ee9dde7c04fe87be"},
|
{file = "greenlet-3.2.3-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:1afd685acd5597349ee6d7a88a8bec83ce13c106ac78c196ee9dde7c04fe87be"},
|
||||||
{file = "greenlet-3.2.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:761917cac215c61e9dc7324b2606107b3b292a8349bdebb31503ab4de3f559ac"},
|
{file = "greenlet-3.2.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:761917cac215c61e9dc7324b2606107b3b292a8349bdebb31503ab4de3f559ac"},
|
||||||
@@ -3183,7 +3195,7 @@ description = "Low-level, pure Python DBus protocol wrapper."
|
|||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.7"
|
python-versions = ">=3.7"
|
||||||
groups = ["dev"]
|
groups = ["dev"]
|
||||||
markers = "platform_machine != \"ppc64le\" and platform_machine != \"s390x\" and sys_platform == \"linux\""
|
markers = "sys_platform == \"linux\" and platform_machine != \"ppc64le\" and platform_machine != \"s390x\""
|
||||||
files = [
|
files = [
|
||||||
{file = "jeepney-0.9.0-py3-none-any.whl", hash = "sha256:97e5714520c16fc0a45695e5365a2e11b81ea79bba796e26f9f1d178cb182683"},
|
{file = "jeepney-0.9.0-py3-none-any.whl", hash = "sha256:97e5714520c16fc0a45695e5365a2e11b81ea79bba796e26f9f1d178cb182683"},
|
||||||
{file = "jeepney-0.9.0.tar.gz", hash = "sha256:cf0e9e845622b81e4a28df94c40345400256ec608d0e55bb8a3feaa9163f5732"},
|
{file = "jeepney-0.9.0.tar.gz", hash = "sha256:cf0e9e845622b81e4a28df94c40345400256ec608d0e55bb8a3feaa9163f5732"},
|
||||||
@@ -4829,7 +4841,7 @@ description = "ONNX Runtime is a runtime accelerator for Machine Learning models
|
|||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.10"
|
python-versions = ">=3.10"
|
||||||
groups = ["main"]
|
groups = ["main"]
|
||||||
markers = "python_version == \"3.12\" or python_version == \"3.13\""
|
markers = "python_version >= \"3.12\""
|
||||||
files = [
|
files = [
|
||||||
{file = "onnxruntime-1.22.0-cp310-cp310-macosx_13_0_universal2.whl", hash = "sha256:85d8826cc8054e4d6bf07f779dc742a363c39094015bdad6a08b3c18cfe0ba8c"},
|
{file = "onnxruntime-1.22.0-cp310-cp310-macosx_13_0_universal2.whl", hash = "sha256:85d8826cc8054e4d6bf07f779dc742a363c39094015bdad6a08b3c18cfe0ba8c"},
|
||||||
{file = "onnxruntime-1.22.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:468c9502a12f6f49ec335c2febd22fdceecc1e4cc96dfc27e419ba237dff5aff"},
|
{file = "onnxruntime-1.22.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:468c9502a12f6f49ec335c2febd22fdceecc1e4cc96dfc27e419ba237dff5aff"},
|
||||||
@@ -4937,6 +4949,21 @@ jsonschema-path = ">=0.3.1,<0.4.0"
|
|||||||
lazy-object-proxy = ">=1.7.1,<2.0.0"
|
lazy-object-proxy = ">=1.7.1,<2.0.0"
|
||||||
openapi-schema-validator = ">=0.6.0,<0.7.0"
|
openapi-schema-validator = ">=0.6.0,<0.7.0"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "openpyxl"
|
||||||
|
version = "3.1.5"
|
||||||
|
description = "A Python library to read/write Excel 2010 xlsx/xlsm files"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.8"
|
||||||
|
groups = ["main"]
|
||||||
|
files = [
|
||||||
|
{file = "openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2"},
|
||||||
|
{file = "openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
et-xmlfile = "*"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "opentelemetry-api"
|
name = "opentelemetry-api"
|
||||||
version = "1.34.1"
|
version = "1.34.1"
|
||||||
@@ -5930,7 +5957,7 @@ description = "A high-level API to automate web browsers"
|
|||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.9"
|
python-versions = ">=3.9"
|
||||||
groups = ["main"]
|
groups = ["main"]
|
||||||
markers = "python_version == \"3.12\" or python_version == \"3.13\""
|
markers = "python_version >= \"3.12\""
|
||||||
files = [
|
files = [
|
||||||
{file = "playwright-1.53.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:48a1a15ce810f0ffe512b6050de9871ea193b41dd3cc1bbed87b8431012419ba"},
|
{file = "playwright-1.53.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:48a1a15ce810f0ffe512b6050de9871ea193b41dd3cc1bbed87b8431012419ba"},
|
||||||
{file = "playwright-1.53.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:a701f9498a5b87e3f929ec01cea3109fbde75821b19c7ba4bba54f6127b94f76"},
|
{file = "playwright-1.53.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:a701f9498a5b87e3f929ec01cea3109fbde75821b19c7ba4bba54f6127b94f76"},
|
||||||
@@ -6227,7 +6254,7 @@ description = "PostgreSQL database adapter for Python"
|
|||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.7"
|
python-versions = ">=3.7"
|
||||||
groups = ["main"]
|
groups = ["main"]
|
||||||
markers = "python_version == \"3.12\" or python_version == \"3.11\""
|
markers = "python_version < \"3.13\""
|
||||||
files = [
|
files = [
|
||||||
{file = "psycopg-3.1.18-py3-none-any.whl", hash = "sha256:4d5a0a5a8590906daa58ebd5f3cfc34091377354a1acced269dd10faf55da60e"},
|
{file = "psycopg-3.1.18-py3-none-any.whl", hash = "sha256:4d5a0a5a8590906daa58ebd5f3cfc34091377354a1acced269dd10faf55da60e"},
|
||||||
{file = "psycopg-3.1.18.tar.gz", hash = "sha256:31144d3fb4c17d78094d9e579826f047d4af1da6a10427d91dfcfb6ecdf6f12b"},
|
{file = "psycopg-3.1.18.tar.gz", hash = "sha256:31144d3fb4c17d78094d9e579826f047d4af1da6a10427d91dfcfb6ecdf6f12b"},
|
||||||
@@ -6280,7 +6307,7 @@ description = "PostgreSQL database adapter for Python -- C optimisation distribu
|
|||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.7"
|
python-versions = ">=3.7"
|
||||||
groups = ["main"]
|
groups = ["main"]
|
||||||
markers = "(python_version == \"3.12\" or python_version == \"3.11\") and implementation_name != \"pypy\""
|
markers = "python_version < \"3.13\" and implementation_name != \"pypy\""
|
||||||
files = [
|
files = [
|
||||||
{file = "psycopg_binary-3.1.18-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5c323103dfa663b88204cf5f028e83c77d7a715f9b6f51d2bbc8184b99ddd90a"},
|
{file = "psycopg_binary-3.1.18-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5c323103dfa663b88204cf5f028e83c77d7a715f9b6f51d2bbc8184b99ddd90a"},
|
||||||
{file = "psycopg_binary-3.1.18-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:887f8d856c91510148be942c7acd702ccf761a05f59f8abc123c22ab77b5a16c"},
|
{file = "psycopg_binary-3.1.18-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:887f8d856c91510148be942c7acd702ccf761a05f59f8abc123c22ab77b5a16c"},
|
||||||
@@ -6731,7 +6758,7 @@ description = "A rough port of Node.js's EventEmitter to Python with a few trick
|
|||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.8"
|
python-versions = ">=3.8"
|
||||||
groups = ["main"]
|
groups = ["main"]
|
||||||
markers = "python_version == \"3.12\" or python_version == \"3.13\""
|
markers = "python_version >= \"3.12\""
|
||||||
files = [
|
files = [
|
||||||
{file = "pyee-13.0.0-py3-none-any.whl", hash = "sha256:48195a3cddb3b1515ce0695ed76036b5ccc2ef3a9f963ff9f77aec0139845498"},
|
{file = "pyee-13.0.0-py3-none-any.whl", hash = "sha256:48195a3cddb3b1515ce0695ed76036b5ccc2ef3a9f963ff9f77aec0139845498"},
|
||||||
{file = "pyee-13.0.0.tar.gz", hash = "sha256:b391e3c5a434d1f5118a25615001dbc8f669cf410ab67d04c4d4e07c55481c37"},
|
{file = "pyee-13.0.0.tar.gz", hash = "sha256:b391e3c5a434d1f5118a25615001dbc8f669cf410ab67d04c4d4e07c55481c37"},
|
||||||
@@ -7044,7 +7071,7 @@ description = "A (partial) reimplementation of pywin32 using ctypes/cffi"
|
|||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.6"
|
python-versions = ">=3.6"
|
||||||
groups = ["dev"]
|
groups = ["dev"]
|
||||||
markers = "platform_machine != \"ppc64le\" and platform_machine != \"s390x\" and sys_platform == \"win32\""
|
markers = "sys_platform == \"win32\" and platform_machine != \"ppc64le\" and platform_machine != \"s390x\""
|
||||||
files = [
|
files = [
|
||||||
{file = "pywin32-ctypes-0.2.3.tar.gz", hash = "sha256:d162dc04946d704503b2edc4d55f3dba5c1d539ead017afa00142c38b9885755"},
|
{file = "pywin32-ctypes-0.2.3.tar.gz", hash = "sha256:d162dc04946d704503b2edc4d55f3dba5c1d539ead017afa00142c38b9885755"},
|
||||||
{file = "pywin32_ctypes-0.2.3-py3-none-any.whl", hash = "sha256:8a1513379d709975552d202d942d9837758905c8d01eb82b8bcc30918929e7b8"},
|
{file = "pywin32_ctypes-0.2.3-py3-none-any.whl", hash = "sha256:8a1513379d709975552d202d942d9837758905c8d01eb82b8bcc30918929e7b8"},
|
||||||
@@ -7784,7 +7811,7 @@ description = "Python bindings to FreeDesktop.org Secret Service API"
|
|||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.6"
|
python-versions = ">=3.6"
|
||||||
groups = ["dev"]
|
groups = ["dev"]
|
||||||
markers = "platform_machine != \"ppc64le\" and platform_machine != \"s390x\" and sys_platform == \"linux\""
|
markers = "sys_platform == \"linux\" and platform_machine != \"ppc64le\" and platform_machine != \"s390x\""
|
||||||
files = [
|
files = [
|
||||||
{file = "SecretStorage-3.3.3-py3-none-any.whl", hash = "sha256:f356e6628222568e3af06f2eba8df495efa13b3b63081dafd4f7d9a7b7bc9f99"},
|
{file = "SecretStorage-3.3.3-py3-none-any.whl", hash = "sha256:f356e6628222568e3af06f2eba8df495efa13b3b63081dafd4f7d9a7b7bc9f99"},
|
||||||
{file = "SecretStorage-3.3.3.tar.gz", hash = "sha256:2403533ef369eca6d2ba81718576c5e0f564d5cca1b58f73a8b23e7d4eeebd77"},
|
{file = "SecretStorage-3.3.3.tar.gz", hash = "sha256:2403533ef369eca6d2ba81718576c5e0f564d5cca1b58f73a8b23e7d4eeebd77"},
|
||||||
@@ -9128,7 +9155,7 @@ description = "Fast implementation of asyncio event loop on top of libuv"
|
|||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.8.0"
|
python-versions = ">=3.8.0"
|
||||||
groups = ["main"]
|
groups = ["main"]
|
||||||
markers = "sys_platform != \"win32\" and sys_platform != \"cygwin\" and platform_python_implementation != \"PyPy\""
|
markers = "platform_python_implementation != \"PyPy\" and sys_platform != \"win32\" and sys_platform != \"cygwin\""
|
||||||
files = [
|
files = [
|
||||||
{file = "uvloop-0.21.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:ec7e6b09a6fdded42403182ab6b832b71f4edaf7f37a9a0e371a01db5f0cb45f"},
|
{file = "uvloop-0.21.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:ec7e6b09a6fdded42403182ab6b832b71f4edaf7f37a9a0e371a01db5f0cb45f"},
|
||||||
{file = "uvloop-0.21.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:196274f2adb9689a289ad7d65700d37df0c0930fd8e4e743fa4834e850d7719d"},
|
{file = "uvloop-0.21.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:196274f2adb9689a289ad7d65700d37df0c0930fd8e4e743fa4834e850d7719d"},
|
||||||
@@ -9763,4 +9790,4 @@ type = ["pytest-mypy"]
|
|||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "2.1"
|
lock-version = "2.1"
|
||||||
python-versions = ">=3.11,<3.14"
|
python-versions = ">=3.11,<3.14"
|
||||||
content-hash = "667e626dd8d08bae4f9b852616a9c2c5df9bdd4a3a37f7ef87030d7bfdf51f3b"
|
content-hash = "441c7080f7fbccb87de476e56dd86d1e56e4c0b8eaa8378d13c90d94a0a42123"
|
||||||
|
|||||||
@@ -78,6 +78,7 @@ lark = "^1.2.2"
|
|||||||
libcst = "^1.8.2"
|
libcst = "^1.8.2"
|
||||||
curlparser = "^0.1.0"
|
curlparser = "^0.1.0"
|
||||||
lmnr = {extras = ["all"], version = "^0.7.0"}
|
lmnr = {extras = ["all"], version = "^0.7.0"}
|
||||||
|
openpyxl = "^3.1.5"
|
||||||
|
|
||||||
[tool.poetry.group.dev.dependencies]
|
[tool.poetry.group.dev.dependencies]
|
||||||
isort = "^5.13.2"
|
isort = "^5.13.2"
|
||||||
|
|||||||
@@ -10,6 +10,8 @@ import { useDebugStore } from "@/store/useDebugStore";
|
|||||||
import { cn } from "@/util/utils";
|
import { cn } from "@/util/utils";
|
||||||
import { NodeHeader } from "../components/NodeHeader";
|
import { NodeHeader } from "../components/NodeHeader";
|
||||||
import { useParams } from "react-router-dom";
|
import { useParams } from "react-router-dom";
|
||||||
|
import { WorkflowDataSchemaInputGroup } from "@/components/DataSchemaInputGroup/WorkflowDataSchemaInputGroup";
|
||||||
|
import { dataSchemaExampleForFileExtraction } from "../types";
|
||||||
|
|
||||||
function FileParserNode({ id, data }: NodeProps<FileParserNode>) {
|
function FileParserNode({ id, data }: NodeProps<FileParserNode>) {
|
||||||
const { updateNodeData } = useReactFlow();
|
const { updateNodeData } = useReactFlow();
|
||||||
@@ -21,8 +23,17 @@ function FileParserNode({ id, data }: NodeProps<FileParserNode>) {
|
|||||||
urlBlockLabel !== undefined && urlBlockLabel === label;
|
urlBlockLabel !== undefined && urlBlockLabel === label;
|
||||||
const [inputs, setInputs] = useState({
|
const [inputs, setInputs] = useState({
|
||||||
fileUrl: data.fileUrl,
|
fileUrl: data.fileUrl,
|
||||||
|
jsonSchema: data.jsonSchema,
|
||||||
});
|
});
|
||||||
|
|
||||||
|
function handleChange(key: string, value: unknown) {
|
||||||
|
if (!data.editable) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
setInputs({ ...inputs, [key]: value });
|
||||||
|
updateNodeData(id, { [key]: value });
|
||||||
|
}
|
||||||
|
|
||||||
const isFirstWorkflowBlock = useIsFirstBlockInWorkflow({ id });
|
const isFirstWorkflowBlock = useIsFirstBlockInWorkflow({ id });
|
||||||
|
|
||||||
return (
|
return (
|
||||||
@@ -75,15 +86,19 @@ function FileParserNode({ id, data }: NodeProps<FileParserNode>) {
|
|||||||
nodeId={id}
|
nodeId={id}
|
||||||
value={inputs.fileUrl}
|
value={inputs.fileUrl}
|
||||||
onChange={(value) => {
|
onChange={(value) => {
|
||||||
if (!data.editable) {
|
handleChange("fileUrl", value);
|
||||||
return;
|
|
||||||
}
|
|
||||||
setInputs({ ...inputs, fileUrl: value });
|
|
||||||
updateNodeData(id, { fileUrl: value });
|
|
||||||
}}
|
}}
|
||||||
className="nopan text-xs"
|
className="nopan text-xs"
|
||||||
/>
|
/>
|
||||||
</div>
|
</div>
|
||||||
|
<WorkflowDataSchemaInputGroup
|
||||||
|
exampleValue={dataSchemaExampleForFileExtraction}
|
||||||
|
value={inputs.jsonSchema}
|
||||||
|
onChange={(value) => {
|
||||||
|
handleChange("jsonSchema", value);
|
||||||
|
}}
|
||||||
|
suggestionContext={{}}
|
||||||
|
/>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|||||||
@@ -1,9 +1,11 @@
|
|||||||
import type { Node } from "@xyflow/react";
|
import type { Node } from "@xyflow/react";
|
||||||
import { NodeBaseData } from "../types";
|
import { NodeBaseData } from "../types";
|
||||||
|
import { AppNode } from "..";
|
||||||
import { debuggableWorkflowBlockTypes } from "@/routes/workflows/types/workflowTypes";
|
import { debuggableWorkflowBlockTypes } from "@/routes/workflows/types/workflowTypes";
|
||||||
|
|
||||||
export type FileParserNodeData = NodeBaseData & {
|
export type FileParserNodeData = NodeBaseData & {
|
||||||
fileUrl: string;
|
fileUrl: string;
|
||||||
|
jsonSchema: string;
|
||||||
};
|
};
|
||||||
|
|
||||||
export type FileParserNode = Node<FileParserNodeData, "fileParser">;
|
export type FileParserNode = Node<FileParserNodeData, "fileParser">;
|
||||||
@@ -14,5 +16,10 @@ export const fileParserNodeDefaultData: FileParserNodeData = {
|
|||||||
label: "",
|
label: "",
|
||||||
fileUrl: "",
|
fileUrl: "",
|
||||||
continueOnFailure: false,
|
continueOnFailure: false,
|
||||||
|
jsonSchema: "null",
|
||||||
model: null,
|
model: null,
|
||||||
} as const;
|
} as const;
|
||||||
|
|
||||||
|
export function isFileParserNode(node: AppNode): node is FileParserNode {
|
||||||
|
return node.type === "fileParser";
|
||||||
|
}
|
||||||
|
|||||||
@@ -162,20 +162,19 @@ const nodeLibraryItems: Array<{
|
|||||||
/>
|
/>
|
||||||
),
|
),
|
||||||
title: "File Parser Block",
|
title: "File Parser Block",
|
||||||
description: "Parse data from files",
|
description: "Parse PDFs, CSVs, and Excel files",
|
||||||
},
|
|
||||||
{
|
|
||||||
nodeType: "pdfParser",
|
|
||||||
icon: (
|
|
||||||
<WorkflowBlockIcon
|
|
||||||
workflowBlockType={WorkflowBlockTypes.PDFParser}
|
|
||||||
className="size-6"
|
|
||||||
/>
|
|
||||||
),
|
|
||||||
title: "PDF Parser Block",
|
|
||||||
description: "Extract data from PDF files",
|
|
||||||
},
|
},
|
||||||
// {
|
// {
|
||||||
|
// nodeType: "pdfParser",
|
||||||
|
// icon: (
|
||||||
|
// <WorkflowBlockIcon
|
||||||
|
// workflowBlockType={WorkflowBlockTypes.PDFParser}
|
||||||
|
// className="size-6"
|
||||||
|
// />
|
||||||
|
// ),
|
||||||
|
// title: "PDF Parser Block",
|
||||||
|
// description: "Extract data from PDF files",
|
||||||
|
// },
|
||||||
// nodeType: "upload",
|
// nodeType: "upload",
|
||||||
// icon: (
|
// icon: (
|
||||||
// <WorkflowBlockIcon
|
// <WorkflowBlockIcon
|
||||||
|
|||||||
@@ -56,7 +56,10 @@ import { ParametersState } from "./types";
|
|||||||
import { AppNode, isWorkflowBlockNode, WorkflowBlockNode } from "./nodes";
|
import { AppNode, isWorkflowBlockNode, WorkflowBlockNode } from "./nodes";
|
||||||
import { codeBlockNodeDefaultData } from "./nodes/CodeBlockNode/types";
|
import { codeBlockNodeDefaultData } from "./nodes/CodeBlockNode/types";
|
||||||
import { downloadNodeDefaultData } from "./nodes/DownloadNode/types";
|
import { downloadNodeDefaultData } from "./nodes/DownloadNode/types";
|
||||||
import { fileParserNodeDefaultData } from "./nodes/FileParserNode/types";
|
import {
|
||||||
|
isFileParserNode,
|
||||||
|
fileParserNodeDefaultData,
|
||||||
|
} from "./nodes/FileParserNode/types";
|
||||||
import {
|
import {
|
||||||
isLoopNode,
|
isLoopNode,
|
||||||
LoopNode,
|
LoopNode,
|
||||||
@@ -468,6 +471,7 @@ function convertToNode(
|
|||||||
data: {
|
data: {
|
||||||
...commonData,
|
...commonData,
|
||||||
fileUrl: block.file_url,
|
fileUrl: block.file_url,
|
||||||
|
jsonSchema: JSON.stringify(block.json_schema, null, 2),
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
@@ -1254,7 +1258,8 @@ function getWorkflowBlock(node: WorkflowBlockNode): BlockYAML {
|
|||||||
...base,
|
...base,
|
||||||
block_type: "file_url_parser",
|
block_type: "file_url_parser",
|
||||||
file_url: node.data.fileUrl,
|
file_url: node.data.fileUrl,
|
||||||
file_type: "csv",
|
file_type: "csv", // Backend will auto-detect based on file extension
|
||||||
|
json_schema: JSONParseSafe(node.data.jsonSchema),
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
case "textPrompt": {
|
case "textPrompt": {
|
||||||
@@ -2187,6 +2192,15 @@ function getWorkflowErrors(nodes: Array<AppNode>): Array<string> {
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
const fileParserNodes = nodes.filter(isFileParserNode);
|
||||||
|
fileParserNodes.forEach((node) => {
|
||||||
|
try {
|
||||||
|
JSON.parse(node.data.jsonSchema);
|
||||||
|
} catch {
|
||||||
|
errors.push(`${node.data.label}: Data schema is not valid JSON.`);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
const waitNodes = nodes.filter(isWaitNode);
|
const waitNodes = nodes.filter(isWaitNode);
|
||||||
waitNodes.forEach((node) => {
|
waitNodes.forEach((node) => {
|
||||||
const waitTimeString = node.data.waitInSeconds.trim();
|
const waitTimeString = node.data.waitInSeconds.trim();
|
||||||
|
|||||||
@@ -354,7 +354,8 @@ export type SendEmailBlock = WorkflowBlockBase & {
|
|||||||
export type FileURLParserBlock = WorkflowBlockBase & {
|
export type FileURLParserBlock = WorkflowBlockBase & {
|
||||||
block_type: "file_url_parser";
|
block_type: "file_url_parser";
|
||||||
file_url: string;
|
file_url: string;
|
||||||
file_type: "csv";
|
file_type: "csv" | "excel" | "pdf";
|
||||||
|
json_schema: Record<string, unknown> | null;
|
||||||
};
|
};
|
||||||
|
|
||||||
export type ValidationBlock = WorkflowBlockBase & {
|
export type ValidationBlock = WorkflowBlockBase & {
|
||||||
|
|||||||
@@ -308,7 +308,8 @@ export type SendEmailBlockYAML = BlockYAMLBase & {
|
|||||||
export type FileUrlParserBlockYAML = BlockYAMLBase & {
|
export type FileUrlParserBlockYAML = BlockYAMLBase & {
|
||||||
block_type: "file_url_parser";
|
block_type: "file_url_parser";
|
||||||
file_url: string;
|
file_url: string;
|
||||||
file_type: "csv";
|
file_type: "csv" | "excel" | "pdf";
|
||||||
|
json_schema?: Record<string, unknown> | null;
|
||||||
};
|
};
|
||||||
|
|
||||||
export type ForLoopBlockYAML = BlockYAMLBase & {
|
export type ForLoopBlockYAML = BlockYAMLBase & {
|
||||||
|
|||||||
@@ -21,6 +21,7 @@ from typing import Annotated, Any, Awaitable, Callable, Literal, Union
|
|||||||
from urllib.parse import quote, urlparse
|
from urllib.parse import quote, urlparse
|
||||||
|
|
||||||
import filetype
|
import filetype
|
||||||
|
import pandas as pd
|
||||||
import structlog
|
import structlog
|
||||||
from email_validator import EmailNotValidError, validate_email
|
from email_validator import EmailNotValidError, validate_email
|
||||||
from jinja2.sandbox import SandboxedEnvironment
|
from jinja2.sandbox import SandboxedEnvironment
|
||||||
@@ -2342,6 +2343,8 @@ class SendEmailBlock(Block):
|
|||||||
|
|
||||||
class FileType(StrEnum):
|
class FileType(StrEnum):
|
||||||
CSV = "csv"
|
CSV = "csv"
|
||||||
|
EXCEL = "excel"
|
||||||
|
PDF = "pdf"
|
||||||
|
|
||||||
|
|
||||||
class FileParserBlock(Block):
|
class FileParserBlock(Block):
|
||||||
@@ -2349,6 +2352,7 @@ class FileParserBlock(Block):
|
|||||||
|
|
||||||
file_url: str
|
file_url: str
|
||||||
file_type: FileType
|
file_type: FileType
|
||||||
|
json_schema: dict[str, Any] | None = None
|
||||||
|
|
||||||
def get_all_parameters(
|
def get_all_parameters(
|
||||||
self,
|
self,
|
||||||
@@ -2364,6 +2368,18 @@ class FileParserBlock(Block):
|
|||||||
self.file_url, workflow_run_context
|
self.file_url, workflow_run_context
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def _detect_file_type_from_url(self, file_url: str) -> FileType:
|
||||||
|
"""Detect file type based on file extension in the URL."""
|
||||||
|
url_lower = file_url.lower()
|
||||||
|
if url_lower.endswith((".xlsx", ".xls", ".xlsm")):
|
||||||
|
return FileType.EXCEL
|
||||||
|
elif url_lower.endswith(".pdf"):
|
||||||
|
return FileType.PDF
|
||||||
|
elif url_lower.endswith(".tsv"):
|
||||||
|
return FileType.CSV # TSV files are handled by the CSV parser
|
||||||
|
else:
|
||||||
|
return FileType.CSV # Default to CSV for .csv and any other extensions
|
||||||
|
|
||||||
def validate_file_type(self, file_url_used: str, file_path: str) -> None:
|
def validate_file_type(self, file_url_used: str, file_path: str) -> None:
|
||||||
if self.file_type == FileType.CSV:
|
if self.file_type == FileType.CSV:
|
||||||
try:
|
try:
|
||||||
@@ -2371,6 +2387,121 @@ class FileParserBlock(Block):
|
|||||||
csv.Sniffer().sniff(file.read(1024))
|
csv.Sniffer().sniff(file.read(1024))
|
||||||
except csv.Error as e:
|
except csv.Error as e:
|
||||||
raise InvalidFileType(file_url=file_url_used, file_type=self.file_type, error=str(e))
|
raise InvalidFileType(file_url=file_url_used, file_type=self.file_type, error=str(e))
|
||||||
|
elif self.file_type == FileType.EXCEL:
|
||||||
|
try:
|
||||||
|
# Try to read the file with pandas to validate it's a valid Excel file
|
||||||
|
pd.read_excel(file_path, nrows=1, engine="openpyxl")
|
||||||
|
except Exception as e:
|
||||||
|
raise InvalidFileType(
|
||||||
|
file_url=file_url_used, file_type=self.file_type, error=f"Invalid Excel file format: {str(e)}"
|
||||||
|
)
|
||||||
|
elif self.file_type == FileType.PDF:
|
||||||
|
try:
|
||||||
|
# Try to read the file with PyPDF to validate it's a valid PDF file
|
||||||
|
reader = PdfReader(file_path)
|
||||||
|
# Just check if we can access pages, don't read content yet
|
||||||
|
_ = len(reader.pages)
|
||||||
|
except Exception as e:
|
||||||
|
raise InvalidFileType(file_url=file_url_used, file_type=self.file_type, error=str(e))
|
||||||
|
|
||||||
|
async def _parse_csv_file(self, file_path: str) -> list[dict[str, Any]]:
|
||||||
|
"""Parse CSV/TSV file and return list of dictionaries."""
|
||||||
|
parsed_data = []
|
||||||
|
with open(file_path) as file:
|
||||||
|
# Try to detect the delimiter (comma for CSV, tab for TSV)
|
||||||
|
sample = file.read(1024)
|
||||||
|
file.seek(0) # Reset file pointer
|
||||||
|
|
||||||
|
# Use csv.Sniffer to detect the delimiter
|
||||||
|
try:
|
||||||
|
dialect = csv.Sniffer().sniff(sample)
|
||||||
|
delimiter = dialect.delimiter
|
||||||
|
except csv.Error:
|
||||||
|
# Default to comma if detection fails
|
||||||
|
delimiter = ","
|
||||||
|
|
||||||
|
reader = csv.DictReader(file, delimiter=delimiter)
|
||||||
|
for row in reader:
|
||||||
|
parsed_data.append(row)
|
||||||
|
return parsed_data
|
||||||
|
|
||||||
|
def _clean_dataframe_for_json(self, df: pd.DataFrame) -> list[dict[str, Any]]:
|
||||||
|
"""Clean DataFrame to ensure it can be serialized to JSON."""
|
||||||
|
# Replace NaN and NaT values with "nan" string
|
||||||
|
df_cleaned = df.replace({pd.NA: "nan", pd.NaT: "nan"})
|
||||||
|
df_cleaned = df_cleaned.where(pd.notna(df_cleaned), "nan")
|
||||||
|
|
||||||
|
# Convert to list of dictionaries
|
||||||
|
records = df_cleaned.to_dict("records")
|
||||||
|
|
||||||
|
# Additional cleaning for any remaining problematic values
|
||||||
|
for record in records:
|
||||||
|
for key, value in record.items():
|
||||||
|
if pd.isna(value) or value == "NaN" or value == "NaT":
|
||||||
|
record[key] = "nan"
|
||||||
|
elif isinstance(value, (pd.Timestamp, pd.DatetimeTZDtype)):
|
||||||
|
# Convert pandas timestamps to ISO format strings
|
||||||
|
record[key] = value.isoformat() if pd.notna(value) else "nan"
|
||||||
|
|
||||||
|
return records
|
||||||
|
|
||||||
|
async def _parse_excel_file(self, file_path: str) -> list[dict[str, Any]]:
|
||||||
|
"""Parse Excel file and return list of dictionaries."""
|
||||||
|
try:
|
||||||
|
# Read Excel file with pandas, specifying engine explicitly
|
||||||
|
df = pd.read_excel(file_path, engine="openpyxl")
|
||||||
|
# Clean and convert DataFrame to list of dictionaries
|
||||||
|
return self._clean_dataframe_for_json(df)
|
||||||
|
except ImportError as e:
|
||||||
|
raise InvalidFileType(
|
||||||
|
file_url=self.file_url,
|
||||||
|
file_type=self.file_type,
|
||||||
|
error=f"Missing required dependency for Excel parsing: {str(e)}. Please install openpyxl: pip install openpyxl",
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
raise InvalidFileType(
|
||||||
|
file_url=self.file_url, file_type=self.file_type, error=f"Failed to parse Excel file: {str(e)}"
|
||||||
|
)
|
||||||
|
|
||||||
|
async def _parse_pdf_file(self, file_path: str) -> str:
|
||||||
|
"""Parse PDF file and return extracted text."""
|
||||||
|
try:
|
||||||
|
reader = PdfReader(file_path)
|
||||||
|
extracted_text = ""
|
||||||
|
page_count = len(reader.pages)
|
||||||
|
for i in range(page_count):
|
||||||
|
extracted_text += reader.pages[i].extract_text() + "\n"
|
||||||
|
return extracted_text
|
||||||
|
except PdfReadError as e:
|
||||||
|
raise InvalidFileType(file_url=self.file_url, file_type=self.file_type, error=str(e))
|
||||||
|
|
||||||
|
async def _extract_with_ai(
    self, content: str | list[dict[str, Any]], workflow_run_context: WorkflowRunContext
) -> dict[str, Any]:
    """Extract structured data from parsed file content using the LLM.

    Uses ``self.json_schema`` when provided; otherwise a generic object schema
    is used. The block instance is never mutated — the schema choice lives in
    a local variable only.
    """
    default_schema: dict[str, Any] = {
        "type": "object",
        "properties": {
            "output": {
                "type": "object",
                "description": "Information extracted from the file",
            }
        },
    }
    # Local variable keeps the instance untouched across calls.
    schema_to_use = self.json_schema or default_schema

    # Tabular rows (CSV/Excel) are serialized to pretty-printed JSON so the
    # model receives a readable representation; plain text passes through.
    content_str = json.dumps(content, indent=2) if isinstance(content, list) else content

    llm_prompt = prompt_engine.load_prompt(
        "extract-information-from-file-text", extracted_text_content=content_str, json_schema=schema_to_use
    )
    return await app.LLM_API_HANDLER(prompt=llm_prompt, prompt_name="extract-information-from-file-text")
|
||||||
|
|
||||||
async def execute(
|
async def execute(
|
||||||
self,
|
self,
|
||||||
@@ -2381,6 +2512,7 @@ class FileParserBlock(Block):
|
|||||||
**kwargs: dict,
|
**kwargs: dict,
|
||||||
) -> BlockResult:
|
) -> BlockResult:
|
||||||
workflow_run_context = self.get_workflow_run_context(workflow_run_id)
|
workflow_run_context = self.get_workflow_run_context(workflow_run_id)
|
||||||
|
|
||||||
if (
|
if (
|
||||||
self.file_url
|
self.file_url
|
||||||
and workflow_run_context.has_parameter(self.file_url)
|
and workflow_run_context.has_parameter(self.file_url)
|
||||||
@@ -2412,21 +2544,71 @@ class FileParserBlock(Block):
|
|||||||
file_path = await download_from_s3(self.get_async_aws_client(), self.file_url)
|
file_path = await download_from_s3(self.get_async_aws_client(), self.file_url)
|
||||||
else:
|
else:
|
||||||
file_path = await download_file(self.file_url)
|
file_path = await download_file(self.file_url)
|
||||||
|
|
||||||
|
# Auto-detect file type based on file extension
|
||||||
|
detected_file_type = self._detect_file_type_from_url(self.file_url)
|
||||||
|
self.file_type = detected_file_type
|
||||||
|
|
||||||
# Validate the file type
|
# Validate the file type
|
||||||
self.validate_file_type(self.file_url, file_path)
|
self.validate_file_type(self.file_url, file_path)
|
||||||
# Parse the file into a list of dictionaries where each dictionary represents a row in the file
|
|
||||||
parsed_data = []
|
LOG.debug(
|
||||||
with open(file_path) as file:
|
"FileParserBlock: After file type validation",
|
||||||
if self.file_type == FileType.CSV:
|
file_type=self.file_type,
|
||||||
reader = csv.DictReader(file)
|
json_schema_present=self.json_schema is not None,
|
||||||
for row in reader:
|
json_schema_type=type(self.json_schema),
|
||||||
parsed_data.append(row)
|
)
|
||||||
|
|
||||||
|
# Parse the file based on type
|
||||||
|
parsed_data: str | list[dict[str, Any]]
|
||||||
|
if self.file_type == FileType.CSV:
|
||||||
|
parsed_data = await self._parse_csv_file(file_path)
|
||||||
|
elif self.file_type == FileType.EXCEL:
|
||||||
|
parsed_data = await self._parse_excel_file(file_path)
|
||||||
|
elif self.file_type == FileType.PDF:
|
||||||
|
parsed_data = await self._parse_pdf_file(file_path)
|
||||||
|
else:
|
||||||
|
return await self.build_block_result(
|
||||||
|
success=False,
|
||||||
|
failure_reason=f"Unsupported file type: {self.file_type}",
|
||||||
|
output_parameter_value=None,
|
||||||
|
status=BlockStatus.failed,
|
||||||
|
workflow_run_block_id=workflow_run_block_id,
|
||||||
|
organization_id=organization_id,
|
||||||
|
)
|
||||||
|
|
||||||
|
# If json_schema is provided, use AI to extract structured data
|
||||||
|
final_data: str | list[dict[str, Any]] | dict[str, Any]
|
||||||
|
LOG.debug(
|
||||||
|
"FileParserBlock: JSON schema check",
|
||||||
|
has_json_schema=self.json_schema is not None,
|
||||||
|
json_schema_type=type(self.json_schema),
|
||||||
|
json_schema=self.json_schema,
|
||||||
|
)
|
||||||
|
|
||||||
|
if self.json_schema:
|
||||||
|
try:
|
||||||
|
ai_extracted_data = await self._extract_with_ai(parsed_data, workflow_run_context)
|
||||||
|
final_data = ai_extracted_data
|
||||||
|
except Exception as e:
|
||||||
|
return await self.build_block_result(
|
||||||
|
success=False,
|
||||||
|
failure_reason=f"Failed to extract data with AI: {str(e)}",
|
||||||
|
output_parameter_value=None,
|
||||||
|
status=BlockStatus.failed,
|
||||||
|
workflow_run_block_id=workflow_run_block_id,
|
||||||
|
organization_id=organization_id,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Return raw parsed data
|
||||||
|
final_data = parsed_data
|
||||||
|
|
||||||
# Record the parsed data
|
# Record the parsed data
|
||||||
await self.record_output_parameter_value(workflow_run_context, workflow_run_id, parsed_data)
|
await self.record_output_parameter_value(workflow_run_context, workflow_run_id, final_data)
|
||||||
return await self.build_block_result(
|
return await self.build_block_result(
|
||||||
success=True,
|
success=True,
|
||||||
failure_reason=None,
|
failure_reason=None,
|
||||||
output_parameter_value=parsed_data,
|
output_parameter_value=final_data,
|
||||||
status=BlockStatus.completed,
|
status=BlockStatus.completed,
|
||||||
workflow_run_block_id=workflow_run_block_id,
|
workflow_run_block_id=workflow_run_block_id,
|
||||||
organization_id=organization_id,
|
organization_id=organization_id,
|
||||||
@@ -2434,6 +2616,11 @@ class FileParserBlock(Block):
|
|||||||
|
|
||||||
|
|
||||||
class PDFParserBlock(Block):
|
class PDFParserBlock(Block):
|
||||||
|
"""
|
||||||
|
DEPRECATED: Use FileParserBlock with file_type=FileType.PDF instead.
|
||||||
|
This block will be removed in a future version.
|
||||||
|
"""
|
||||||
|
|
||||||
block_type: Literal[BlockType.PDF_PARSER] = BlockType.PDF_PARSER
|
block_type: Literal[BlockType.PDF_PARSER] = BlockType.PDF_PARSER
|
||||||
|
|
||||||
file_url: str
|
file_url: str
|
||||||
|
|||||||
@@ -244,6 +244,7 @@ class FileParserBlockYAML(BlockYAML):
|
|||||||
|
|
||||||
file_url: str
|
file_url: str
|
||||||
file_type: FileType
|
file_type: FileType
|
||||||
|
json_schema: dict[str, Any] | None = None
|
||||||
|
|
||||||
|
|
||||||
class PDFParserBlockYAML(BlockYAML):
|
class PDFParserBlockYAML(BlockYAML):
|
||||||
|
|||||||
@@ -1926,6 +1926,7 @@ class WorkflowService:
|
|||||||
output_parameter=output_parameter,
|
output_parameter=output_parameter,
|
||||||
file_url=block_yaml.file_url,
|
file_url=block_yaml.file_url,
|
||||||
file_type=block_yaml.file_type,
|
file_type=block_yaml.file_type,
|
||||||
|
json_schema=block_yaml.json_schema,
|
||||||
continue_on_failure=block_yaml.continue_on_failure,
|
continue_on_failure=block_yaml.continue_on_failure,
|
||||||
)
|
)
|
||||||
elif block_yaml.block_type == BlockType.PDF_PARSER:
|
elif block_yaml.block_type == BlockType.PDF_PARSER:
|
||||||
|
|||||||
252
tests/unit_tests/test_file_parser_block.py
Normal file
252
tests/unit_tests/test_file_parser_block.py
Normal file
@@ -0,0 +1,252 @@
|
|||||||
|
import os
|
||||||
|
import tempfile
|
||||||
|
from datetime import datetime
|
||||||
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from skyvern.forge.sdk.workflow.models.block import FileParserBlock, FileType
|
||||||
|
from skyvern.forge.sdk.workflow.models.parameter import OutputParameter
|
||||||
|
|
||||||
|
|
||||||
|
class TestFileParserBlock:
    """Unit tests for FileParserBlock: fixtures build temp CSV/Excel/TSV files
    and a mocked OutputParameter; tests cover parsing, file-type detection and
    validation, DataFrame cleaning, and mocked AI extraction."""

    @pytest.fixture
    def file_parser_block(self):
        """Create a basic FileParserBlock instance for testing."""
        # Create a mock OutputParameter with all required fields
        mock_output_parameter = MagicMock(spec=OutputParameter)
        mock_output_parameter.parameter_type = "output"
        mock_output_parameter.key = "test_output"
        mock_output_parameter.output_parameter_id = "test_id"
        mock_output_parameter.workflow_id = "test_workflow_id"
        mock_output_parameter.created_at = datetime.now()
        mock_output_parameter.modified_at = datetime.now()
        mock_output_parameter.deleted_at = None

        return FileParserBlock(
            label="test_parser", output_parameter=mock_output_parameter, file_url="test.csv", file_type=FileType.CSV
        )

    @pytest.fixture
    def csv_file(self):
        """Create a temporary CSV file for testing."""
        with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f:
            f.write("name,age,city\nJohn,30,New York\nJane,25,Boston")
            temp_file = f.name

        yield temp_file
        # Clean up the temp file after the test finishes with it
        os.unlink(temp_file)

    @pytest.fixture
    def excel_file(self):
        """Create a temporary Excel file for testing."""
        df = pd.DataFrame({"name": ["John", "Jane"], "age": [30, 25], "city": ["New York", "Boston"]})

        with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as f:
            df.to_excel(f.name, index=False)
            temp_file = f.name

        yield temp_file
        os.unlink(temp_file)

    @pytest.fixture
    def tsv_file(self):
        """Create a temporary TSV file for testing."""
        with tempfile.NamedTemporaryFile(mode="w", suffix=".tsv", delete=False) as f:
            f.write("name\tage\tcity\nJohn\t30\tNew York\nJane\t25\tBoston")
            temp_file = f.name

        yield temp_file
        os.unlink(temp_file)

    def test_file_type_enum_values(self):
        """Test that FileType enum has the expected values."""
        assert FileType.CSV == "csv"
        assert FileType.EXCEL == "excel"
        assert FileType.PDF == "pdf"

    def test_file_parser_block_initialization(self, file_parser_block):
        """Test that FileParserBlock initializes correctly."""
        assert file_parser_block.label == "test_parser"
        assert file_parser_block.file_url == "test.csv"
        assert file_parser_block.file_type == FileType.CSV
        # json_schema defaults to None when not supplied
        assert file_parser_block.json_schema is None

    def test_file_parser_block_with_schema(self):
        """Test that FileParserBlock can be initialized with a schema."""
        schema = {"type": "object", "properties": {"name": {"type": "string"}, "age": {"type": "integer"}}}

        # Create a mock OutputParameter
        mock_output_parameter = MagicMock(spec=OutputParameter)
        mock_output_parameter.parameter_type = "output"
        mock_output_parameter.key = "test_output"
        mock_output_parameter.output_parameter_id = "test_id"
        mock_output_parameter.workflow_id = "test_workflow_id"
        mock_output_parameter.created_at = datetime.now()
        mock_output_parameter.modified_at = datetime.now()
        mock_output_parameter.deleted_at = None

        block = FileParserBlock(
            label="test_parser",
            output_parameter=mock_output_parameter,
            file_url="test.csv",
            file_type=FileType.CSV,
            json_schema=schema,
        )

        assert block.json_schema == schema

    @pytest.mark.asyncio
    async def test_parse_csv_file(self, file_parser_block, csv_file):
        """Test CSV file parsing."""
        result = await file_parser_block._parse_csv_file(csv_file)

        # CSV values are parsed as strings
        expected = [{"name": "John", "age": "30", "city": "New York"}, {"name": "Jane", "age": "25", "city": "Boston"}]

        assert result == expected

    @pytest.mark.asyncio
    async def test_parse_excel_file(self, file_parser_block, excel_file):
        """Test Excel file parsing."""
        result = await file_parser_block._parse_excel_file(excel_file)

        # Excel parsing preserves numeric types (age is int here, unlike CSV)
        expected = [{"name": "John", "age": 30, "city": "New York"}, {"name": "Jane", "age": 25, "city": "Boston"}]

        assert result == expected

    @pytest.mark.asyncio
    async def test_parse_tsv_file(self, file_parser_block, tsv_file):
        """Test TSV file parsing."""
        # TSV is handled by the CSV parser path
        result = await file_parser_block._parse_csv_file(tsv_file)

        expected = [{"name": "John", "age": "30", "city": "New York"}, {"name": "Jane", "age": "25", "city": "Boston"}]

        assert result == expected

    @pytest.mark.asyncio
    async def test_validate_csv_file_type(self, file_parser_block, csv_file):
        """Test CSV file type validation."""
        # Should not raise an exception
        file_parser_block.validate_file_type("test.csv", csv_file)

    @pytest.mark.asyncio
    async def test_validate_excel_file_type(self, file_parser_block, excel_file):
        """Test Excel file type validation."""
        file_parser_block.file_type = FileType.EXCEL
        # Should not raise an exception
        file_parser_block.validate_file_type("test.xlsx", excel_file)

    @pytest.mark.asyncio
    async def test_validate_invalid_csv_file(self, file_parser_block):
        """Test validation of invalid CSV file."""
        # Create a binary file that's definitely not CSV
        with tempfile.NamedTemporaryFile(mode="wb", suffix=".csv", delete=False) as f:
            f.write(b"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f")
            temp_file = f.name

        try:
            with pytest.raises(Exception):
                file_parser_block.validate_file_type("test.csv", temp_file)
        finally:
            os.unlink(temp_file)

    @pytest.mark.asyncio
    async def test_extract_with_ai_with_schema(self, file_parser_block):
        """Test AI extraction with a provided schema."""
        schema = {
            "type": "object",
            "properties": {
                "extracted_data": {
                    "type": "object",
                    "properties": {
                        "names": {"type": "array", "items": {"type": "string"}},
                        "total_count": {"type": "integer"},
                    },
                }
            },
        }

        file_parser_block.json_schema = schema

        # Mock the LLM response
        mock_response = {"extracted_data": {"names": ["John", "Jane"], "total_count": 2}}

        with patch("skyvern.forge.sdk.workflow.models.block.app.LLM_API_HANDLER") as mock_llm:
            mock_llm.return_value = mock_response

            with patch("skyvern.forge.sdk.workflow.models.block.prompt_engine.load_prompt") as mock_prompt:
                mock_prompt.return_value = "mocked prompt"

                result = await file_parser_block._extract_with_ai([{"name": "John"}, {"name": "Jane"}], MagicMock())

                assert result == mock_response
                mock_llm.assert_called_once()
                mock_prompt.assert_called_once()

    @pytest.mark.asyncio
    async def test_extract_with_ai_without_schema(self, file_parser_block):
        """Test AI extraction without a provided schema (should use default)."""
        # Mock the LLM response
        mock_response = {"output": {"summary": "Extracted data from file"}}

        with patch("skyvern.forge.sdk.workflow.models.block.app.LLM_API_HANDLER") as mock_llm:
            mock_llm.return_value = mock_response

            with patch("skyvern.forge.sdk.workflow.models.block.prompt_engine.load_prompt") as mock_prompt:
                mock_prompt.return_value = "mocked prompt"

                result = await file_parser_block._extract_with_ai("Some text content", MagicMock())

                assert result == mock_response
                # Should NOT mutate the instance - json_schema should remain None
                assert file_parser_block.json_schema is None
                mock_llm.assert_called_once()
                mock_prompt.assert_called_once()

    def test_detect_file_type_from_url(self, file_parser_block):
        """Test file type detection based on URL extension."""
        # Test Excel files
        assert file_parser_block._detect_file_type_from_url("https://example.com/data.xlsx") == FileType.EXCEL
        assert file_parser_block._detect_file_type_from_url("https://example.com/data.xls") == FileType.EXCEL
        assert file_parser_block._detect_file_type_from_url("https://example.com/data.xlsm") == FileType.EXCEL

        # Test PDF files
        assert file_parser_block._detect_file_type_from_url("https://example.com/document.pdf") == FileType.PDF

        # Test CSV files (default)
        assert file_parser_block._detect_file_type_from_url("https://example.com/data.csv") == FileType.CSV
        assert file_parser_block._detect_file_type_from_url("https://example.com/data.tsv") == FileType.CSV
        assert file_parser_block._detect_file_type_from_url("https://example.com/data.txt") == FileType.CSV
        assert file_parser_block._detect_file_type_from_url("https://example.com/data") == FileType.CSV

    def test_clean_dataframe_for_json(self, file_parser_block):
        """Test DataFrame cleaning for JSON serialization."""
        # Create a DataFrame with NaN, NaT, and timestamp values
        df = pd.DataFrame(
            {
                "OrderDate": ["2018-01-01", pd.NaT, "2018-01-03"],
                "Region": ["North", "South", pd.NA],
                "Sales": [1000.0, pd.NA, 3000.0],
                "Timestamp": [pd.Timestamp("2018-01-01"), pd.NaT, pd.Timestamp("2018-01-03")],
            }
        )

        # Clean the DataFrame
        result = file_parser_block._clean_dataframe_for_json(df)

        # Check that NaN and NaT values are converted to "nan" string
        assert result[0]["OrderDate"] == "2018-01-01"
        assert result[0]["Region"] == "North"
        assert result[0]["Sales"] == 1000.0
        assert result[0]["Timestamp"] == "2018-01-01T00:00:00"

        assert result[1]["OrderDate"] == "nan"
        assert result[1]["Region"] == "South"
        assert result[1]["Sales"] == "nan"
        assert result[1]["Timestamp"] == "nan"

        assert result[2]["OrderDate"] == "2018-01-03"
        assert result[2]["Region"] == "nan"
        assert result[2]["Sales"] == 3000.0
        assert result[2]["Timestamp"] == "2018-01-03T00:00:00"
|
||||||
Reference in New Issue
Block a user