我正在尝试按照这里建议的做法(原文此处为一个链接),使用 BeautifulSoup 提取特定标签之间的信息。
我的文件夹中有很多 .html 文件,我想把 BeautifulSoup 脚本提取到的结果以单独的 .txt 文件的形式保存到另一个文件夹中。
这些 .txt 文件应当与原始文件同名,但只包含提取出的内容。
我写的脚本(见下文)能够成功处理这些文件,但没有把提取出的内容分别写入各自的文件。
import os
import glob
from bs4 import BeautifulSoup
# Folder that holds the input .html files.
# NOTE(review): "C:My_folder" has no separator after the drive letter, unlike
# results_dir below ("C:\\My_folder") -- presumably a typo; confirm.
dir_path = "C:My_folder\\tmp\\"

# Every path produced by glob.glob() already has dir_path in front of it.
for file_name in glob.glob(os.path.join(dir_path, "*.html")):
    my_data = (file_name)
    # The input handle is never closed explicitly.
    soup = BeautifulSoup(open(my_data, "r").read())
    # Everything below runs once per matching <font> element, not once per file.
    for i in soup.select('font[color="#FF0000"]'):
        print(i.text)
        # NOTE(review): file_name already includes dir_path, so this join looks
        # redundant at best -- confirm what path it actually yields.
        file_path = os.path.join(dir_path, file_name)
        # `text` is read here but never used afterwards.
        text = open(file_path, mode='r').read()
        results = i.text
        results_dir = "C:\\My_folder\\tmp\\working"
        # Dropping the last four characters removes "html" and keeps the dot.
        # results_file is still built from the full input path, not a bare name.
        results_file = file_name[:-4] + 'txt'
        # NOTE(review): results_file is derived from a dir_path-prefixed path,
        # so the result may not be inside results_dir -- verify.
        file_path = os.path.join(results_dir, results_file)
        # Mode 'w' truncates, so each element overwrites what the previous
        # element wrote; only the last match remains in the file.
        open(file_path, mode='w', encoding='UTF-8').write(results)
glob 返回的是完整路径。你每找到一个 font 元素就重新打开一次输出文件,
从而覆盖了文件中已有的内容。应该把打开文件的操作移到循环之外;同时应当把文件作为上下文管理器使用(即使用 with
语句),以确保它们被正确关闭:
import glob
import os.path
from bs4 import BeautifulSoup
# Folder with the source .html files, and the folder that receives the
# extracted .txt files. Raw strings keep the Windows backslashes literal.
dir_path = r"C:\My_folder\tmp"
results_dir = r"C:\My_folder\tmp\working"

for file_name in glob.glob(os.path.join(dir_path, "*.html")):
    # Parse each input once; the handle is closed when the with-block ends.
    with open(file_name) as html_file:
        soup = BeautifulSoup(html_file)

    # glob returns paths that already start with dir_path. Joining such a path
    # onto results_dir would discard results_dir, so the output would land next
    # to the source file. Keep only the base name before building the target.
    base_name = os.path.splitext(os.path.basename(file_name))[0]
    results_file = base_name + '.txt'

    # Open the output once per input file so every match is kept, one per
    # line. UTF-8 avoids encode errors from the platform default codec.
    with open(os.path.join(results_dir, results_file), 'w', encoding='utf-8') as outfile:
        for i in soup.select('font[color="#FF0000"]'):
            print(i.text)
            outfile.write(i.text + '\n')
import glob
import os
from BeautifulSoup import BeautifulSoup
# Folder that is scanned for *.html input files.
input_dir = "/home/infogrid/Desktop/Work/stack_over/input/"
# Folder that receives the .txt results. It must already exist; nothing in
# this script creates it.
output_dir = "/home/infogrid/Desktop/Work/stack_over/output/"

# os.path.join keeps the pattern valid with or without a trailing slash.
for file_name in glob.glob(os.path.join(input_dir, "*.html")):
    with open(file_name) as html_file:
        soup = BeautifulSoup(html_file)

    # Name the output after the input: same base name, .txt extension,
    # placed in output_dir.
    base_name = os.path.splitext(os.path.basename(file_name))[0]
    results_file = os.path.join(output_dir, base_name + ".txt")

    # Text of every <font> tag whose color attribute is exactly "#FF0000",
    # one tag per line.
    extracted = "\n".join(
        tag.text for tag in soup.findAll('font') if tag.get("color") == "#FF0000"
    )

    with open(results_file, 'w') as out_file:
        # Single-argument print(...) behaves the same on Python 2 and Python 3.
        print(extracted)
        out_file.write(extracted)