yi_Xu
菜单
置顶

python 实现 HTML 的表格转 csv

我个人需要整理一张网页上的表格,麻烦的是还需要保留网页原有的表格样式,于是试着去提取网页中的内容,写了点代码,作为参考。

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# -*- coding: utf-8 -*-

__author__ = 'yi_Xu'

from bs4 import BeautifulSoup
import pandas as pd
import os

class getPage(object):
    """Round-trip the rows of an HTML table through an editable CSV file.

    On construction the page at ``src`` is parsed, its table rows are read
    into a DataFrame (``page_table``) and, when no CSV exists yet, that grid
    is written to ``./csv/<csv_name>``.  ``modify_table`` later copies the
    CSV values back into the parsed page and saves the result as a separate
    HTML file, so the source file itself is never overwritten.

    Args:
        src: Path of the HTML file to read.
        classname: Stored on the instance; no method of this class reads it.
        backup_name: File name of the rewritten HTML.  Defaults to
            ``<stem>_change.html`` where ``<stem>`` is the source file name
            without its extension.
        csv_name: File name of the CSV.  Defaults to ``<stem>_modify.csv``.
    """

    def __init__(self, src=os.path.join(".", 'html', 'page1.html'),
                 classname="", backup_name="", csv_name=""):
        self.src = src
        self.classname = classname
        self.filename = os.path.split(src)[1]
        # splitext handles any extension; slicing off five characters was
        # only right for names ending in ".html".
        stem = os.path.splitext(self.filename)[0]
        self.backup_name = backup_name or stem + '_change.html'
        self.csv_name = csv_name or stem + '_modify.csv'
        self.page_html = self._get_page()
        self.page_table = self._get_table()

    @staticmethod
    def _ensure_parent_dir(path):
        """Create the directory that will contain ``path`` if it is missing."""
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)

    def _iter_rows(self):
        """Yield the table rows in document order.

        Iteration starts at the first ``<tr>`` inside ``<tbody>`` (or the
        first ``<tr>`` of the page when there is no ``<tbody>``) and follows
        ``find_next_sibling("tr")`` until no sibling row is left.  Reading
        and writing both go through this method so row *n* of the CSV always
        refers to the same ``<tr>``.
        """
        body = self.page_html.tbody
        row = (body if body is not None else self.page_html).tr
        while row is not None:
            yield row
            row = row.find_next_sibling("tr")

    def _get_page(self, src=""):
        """Parse an HTML file with ``html.parser`` and return the tree.

        Args:
            src: Path to read; falls back to ``self.src`` when empty.
        """
        # Read as UTF-8 explicitly: the backup is written as UTF-8, and the
        # platform default encoding is not UTF-8 everywhere.
        with open(src or self.src, mode="r", encoding="utf-8") as f:
            return BeautifulSoup(f.read(), "html.parser")

    def modify_backup(self, html_code, backup_src=""):
        """Write ``html_code.prettify()`` to disk as UTF-8.

        Args:
            html_code: Parsed page; only its ``prettify()`` method is used.
            backup_src: Output path; defaults to ``./result/<backup_name>``.
                Missing parent directories are created.
        """
        target = backup_src or os.path.join(".", 'result', self.backup_name)
        self._ensure_parent_dir(target)
        with open(target, mode="w", encoding="utf-8") as f:
            f.write(html_code.prettify())

    def modify_table(self, result=""):
        """Copy the values of ``./csv/<csv_name>`` back into the page.

        Cell ``(r, c)`` of the CSV (read without a header row) replaces the
        ``string`` of child ``c`` of table row ``r``; the modified page is
        then saved through ``modify_backup``.  The CSV must provide a value
        for every row and child visited, otherwise the pandas lookup raises.

        Args:
            result: Output path for the rewritten HTML.  When empty the
                default location of ``modify_backup`` is used.
        """
        csv_src = os.path.join(".", 'csv', self.csv_name)
        df = pd.read_csv(csv_src, encoding="utf-8", header=None)
        for row_number, row in enumerate(self._iter_rows()):
            csv_row = df.iloc[row_number]
            for column_number, child in enumerate(row.children):
                child.string = str(csv_row[column_number])
        self.modify_backup(self.page_html, backup_src=result)

    def _get_table(self, csv_src=""):
        """Collect the table cells into a DataFrame and seed the CSV file.

        Each table row becomes one DataFrame row holding the ``string`` of
        every child of that ``<tr>``; rows with fewer children than the
        widest row are padded on the right with ``'*'``.  A table without
        rows yields an empty DataFrame.

        Args:
            csv_src: CSV path; defaults to ``./csv/<csv_name>``.  The file is
                written (no index, no header) only when it does not exist
                yet, so an already edited CSV is never overwritten.

        Returns:
            The DataFrame built from the page.
        """
        csv_src = csv_src or os.path.join(".", 'csv', self.csv_name)
        # NOTE: ``children`` is used as-is, so every direct child node of a
        # row occupies one column; modify_table relies on the same indexing.
        rows = [[child.string for child in row.children]
                for row in self._iter_rows()]
        # Size the grid from the widest row and from the real row count.
        width = max((len(cells) for cells in rows), default=0)
        data = [cells + ['*'] * (width - len(cells)) for cells in rows]
        df = pd.DataFrame(data=data)
        if not os.path.exists(csv_src):
            self._ensure_parent_dir(csv_src)
            df.to_csv(csv_src, index=False, header=False, encoding="utf-8")
        return df


if __name__ == "__main__":
    # Demo run: parse the sample page and show the extracted cell grid.
    page = getPage(src='./html/page1.html')
    table_frame = page.page_table
    print(table_frame)
    # Optional second step, left disabled in the original script:
    #   page.modify_table(result='./result/page1.html')