con*_*der 4 image-processing computer-vision palantir-foundry
我正在探索 Palantir Foundry 平台,它似乎有大量针对矩形数据或结构化数据的选项。有人有在 Foundry 平台上处理非结构化大数据的经验吗?我们如何使用 Foundry 进行图像分析?
尽管大多数示例都是使用表格数据给出的,但实际上很多用例都使用 Foundry 进行非结构化和半结构化数据处理。

您应该将数据集视为文件容器,并通过 API 来访问和处理这些文件。

使用文件级 API,您可以访问数据集中的文件并根据需要处理它们。如果这些文件是图像,您可以从文件中提取信息并根据需要使用它。

一个常见用例是将 PDF 作为文件存放在数据集中,从 PDF 中提取信息并将其存储为表格信息,以便您可以对其进行结构化和非结构化搜索。
以下是提取 PDF 的文件访问示例:
\nimport com.palantir.transforms.lang.java.api.Compute;\nimport com.palantir.transforms.lang.java.api.FoundryInput;\nimport com.palantir.transforms.lang.java.api.FoundryOutput;\nimport com.palantir.transforms.lang.java.api.Input;\nimport com.palantir.transforms.lang.java.api.Output;\nimport com.palantir.util.syntacticpath.Paths;\nimport com.google.common.collect.AbstractIterator;\nimport com.palantir.spark.binarystream.data.PortableFile;\nimport java.io.IOException;\nimport java.io.InputStream;\nimport java.util.Iterator;\nimport java.util.UUID;\nimport org.apache.spark.api.java.function.FlatMapFunction;\nimport org.apache.spark.sql.Dataset;\nimport org.apache.spark.sql.Encoders;\nimport org.apache.spark.sql.Row;\nimport org.apache.pdfbox.pdmodel.PDDocument; \nimport org.apache.pdfbox.text.PDFTextStripper;\n\n\npublic final class ExtractPDFText {\n\n private static String pdf_source_files_rid = "SOME RID";\n private static String dataProxyPath = "/foundry-data-proxy/api/dataproxy/datasets/";\n private static String datasetViewPath = "/views/master/";\n\n @Compute \n public void compute(\n @Input("/Base/project_name/treasury_pdf_docs") FoundryInput pdfFiles, \n @Output("/Base/project_name/clean/pdf_text_extracted") FoundryOutput output) throws IOException {\n\n Dataset<PortableFile> filesDataset = pdfFiles.asFiles().getFileSystem().filesAsDataset(); \n\n Dataset<String> mappedDataset = filesDataset.flatMap((FlatMapFunction<PortableFile, String>) portableFile -> \n portableFile.convertToIterator(inputStream -> {\n\n String pdfFileName = portableFile.getLogicalPath().getFileName().toString();\n return new PDFIterator(inputStream, pdfFileName);\n }), Encoders.STRING());\n\n Dataset<Row> dataset = filesDataset\n .sparkSession()\n .read()\n .option("inferSchema", "false")\n .json(mappedDataset);\n\n output.getDataFrameWriter(dataset).write();\n }\n\n private static final class PDFIterator extends AbstractIterator<String> {\n private InputStream inputStream;\n private 
String pdfFileName;\n private boolean done;\n\n PDFIterator(InputStream inputStream, String pdfFileName) throws IOException {\n this.inputStream = inputStream;\n this.pdfFileName = pdfFileName;\n this.done = false;\n }\n\n @Override\n protected String computeNext() {\n if (done) {\n return endOfData();\n }\n\n try {\n String objectId = pdfFileName;\n String appUrl = dataProxyPath.concat(pdf_source_files_rid).concat(datasetViewPath).concat(pdfFileName);\n PDDocument document = PDDocument.load(inputStream);\n\n PDFTextStripper pdfStripper = new PDFTextStripper();\n\n String text = pdfStripper.getText(document);\n String strippedText = text.replace("\\"", "\'").replace("\\\\", "").replace("\xe2\x80\x9c", "\'").replace("\xe2\x80\x9d", "\'").replace("\\n", "").replace("\\r", "");\n\n done = true;\n return "{\\"id\\": \\"" + String.valueOf(UUID.randomUUID()) + "\\", \\"file_name\\": \\"" + pdfFileName + "\\", \\"app_url\\": \\"" + appUrl + "\\", \\"object_id\\": \\"" + objectId + "\\", \\"text\\": \\"" + strippedText + "\\"}\\n";\n } catch (IOException e) {\n throw new RuntimeException(e);\n }\n }\n }\n} \nRun Code Online (Sandbox Code Playgroud)\n