批量提取文件中内容

顺序提取:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
import os

path = "/root/桌面/ISO1995/"  # directory holding the input text files


def merge_text_files(src_dir, out_file='f2.txt'):
    """Concatenate every regular file in *src_dir* into *out_file*.

    Files are processed in sorted name order and their contents are joined
    with no separator. Each input path is printed as it is read.

    Args:
        src_dir: Directory whose files are read (as UTF-8 text).
        out_file: Path of the merged output file.

    Returns:
        The merged text that was written to *out_file*.

    Raises:
        FileNotFoundError: If *src_dir* does not exist.
        UnicodeDecodeError: If an input file is not valid UTF-8.
    """
    chunks = []
    for name in sorted(os.listdir(src_dir)):
        position = os.path.join(src_dir, name)
        # os.listdir also returns sub-directories; open() would fail on them.
        if not os.path.isfile(position):
            continue
        print(position)
        with open(position, "r", encoding='utf-8') as f:
            chunks.append(f.read())

    merged = ''.join(chunks)
    # Write with the same encoding used for reading; the platform default
    # may be unable to encode the text.
    with open(out_file, 'w', encoding='utf-8') as f2:
        f2.write(merged)
    return merged


if __name__ == "__main__":
    txts = merge_text_files(path)
    print(txts)

批量提取 Word 内容到 txt:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
#!/usr/bin/env python
# encoding: utf-8

import docx
import os

path = "/root/桌面/22/"  # directory holding the .docx files


def extract_docx_text(src_dir, out_file='f2.txt'):
    """Write the text of every .docx file in *src_dir* to *out_file*.

    Each document becomes one output line: its paragraphs joined by a
    single space. Documents are processed in sorted name order and each
    input path is printed as it is opened.

    Args:
        src_dir: Directory that is scanned (non-recursively) for .docx files.
        out_file: Path of the UTF-8 text file to write.

    Returns:
        A list with one string per document, in the order written.

    Raises:
        FileNotFoundError: If *src_dir* does not exist.
    """
    # Take the names from src_dir itself so that the joined paths are valid.
    # Skip non-.docx entries and the "~$" owner files Word leaves next to
    # open documents.
    names = sorted(
        name for name in os.listdir(src_dir)
        if name.lower().endswith('.docx') and not name.startswith('~$')
    )

    contents_list = []
    for name in names:
        file_name = os.path.join(src_dir, name)
        print(file_name)
        document = docx.Document(file_name)
        contents_list.append(
            ' '.join(para.text for para in document.paragraphs)
        )

    with open(out_file, 'w', encoding='utf8') as f:
        f.writelines(str_line + '\n' for str_line in contents_list)
    return contents_list


if __name__ == "__main__":
    contents_list = extract_docx_text(path)

请我喝杯咖啡吧~

支付宝
微信