Python+Streamlit在网页中提取PDF中文字、表格对象
优采云 发布时间: 2022-06-19 13:11
Python+Streamlit在网页中提取PDF中文字、表格对象
大家好,今天给大家介绍如何结合Streamlit在网页中提取PDF文档的内容,例如PDF的基本信息、文本信息和表格。
实现效果
实现代码
<p>import streamlit as st<br />import pdfplumber<br />import io<br />from pandas import DataFrame<br />import pandas as pd<br />import fitz<br />import streamlit.components.v1 as components<br />st.set_page_config(page_title="操作PDF", layout="wide")<br /><br />css = """<br />#MainMenu {visibility:hidden;}<br />footer {visibility:hidden;}<br /><br />.stDownloadButton>button {<br /> background-color: #0099ff;<br /> color:#ffffff;<br />}<br /><br />.stDownloadButton>button:hover {<br /> background-color: #00ff00;<br /> color:#ff0000;<br /> }<br /><br />"""<br />st.markdown(css, unsafe_allow_html=True)<br /><br />def convert_df(df):<br /> st.download_button(<br /> label="点我下载表格",<br /> data=df.to_csv().encode('gbk'),<br /> file_name='table.csv',<br /> mime='text/csv',<br /> )<br /><br />def draw_table(df, theme, table_height):<br /> columns = df.columns<br /> thead1=""""""<br /> thead_temp = []<br /> for k in range(len(list(columns))):<br /> thead_temp.append(""""""+str(list(columns)[k])+"""""")<br /> header = thead1+"".join(thead_temp)+""""""<br /> rows = []<br /> rows_temp = []<br /> for i in range(df.shape[0]):<br /> rows.append(""""""+str(i+1)+"""""")<br /> rows_temp.append(df.iloc[i].values.tolist())<br /> td_temp = []<br /> for j in range(len(rows_temp)):<br /> for m in range(len(rows_temp[j])):<br /> td_temp.append(""""""+str(rows_temp[j][m])+"""""")<br /> td_temp2 = []<br /> for n in range(len(td_temp)):<br /> td_temp2.append(td_temp[n:n+df.shape[1]])<br /> td_temp3 = []<br /> for x in range(len(td_temp2)):<br /> if int(x % (df.shape[1])) == 0:<br /> td_temp3.append(td_temp2[x])<br /> td_temp4 = []<br /> for y in range(len(td_temp3)):<br /> td_temp4.append("".join(td_temp3[y]))<br /> td_temp5 = []<br /> for v in range(len(td_temp4)):<br /> td_temp5.append(""""""+str(v+1)+""""""+str(td_temp4[v])+"""""")<br /> table_html = """"""+\<br /> """"""+\<br /> """