Source code for eppy.results.readhtml

# Copyright (c) 2012 Santosh Philip
# =======================================================================
#  Distributed under the MIT License.
#  (See accompanying file LICENSE or copy at
#  http://opensource.org/licenses/MIT)
# =======================================================================

"""read the html outputs"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import string
import collections
from bs4 import BeautifulSoup, NavigableString, Tag


class NotSimpleTable(Exception):
    """Exception for a table whose cells cannot be read as plain strings.

    NOTE(review): nothing in the visible part of this module raises it;
    the only mention is a commented-out ``raise`` in ``table2val_matrix``.
    """
def tdbr2EOL(td):
    """Turn every <br/> inside a <td> block into a line ending ("\\n").

    The <br/> tags are replaced in the ``td`` that was passed in. That
    modified cell is then serialised and parsed again with the "lxml"
    parser, and the <td> found in the new parse is what gets returned
    (so the result is a different object from the argument).
    """
    for linebreak in td.find_all("br"):
        linebreak.replace_with("\n")
    # Back to text (this would be unicode(td) in python2) ...
    markup = str(td)
    # ... and read it as a BeautifulSoup document once more.
    reparsed = BeautifulSoup(markup, "lxml")
    # The parsed document holds more than the cell itself, so pull out
    # just the <td> block.
    return reparsed.find("td")
def is_simpletable(table):
    """Return True when every non-empty cell of the table is one string.

    Each <td> that has contents is normalised with ``tdbr2EOL`` and must
    then hold exactly one item, and that item must be a NavigableString.
    The first cell that breaks this rule makes the function return False.
    Cells without any contents are accepted as they are.
    """
    for cell in table("td"):
        if cell.contents == []:
            continue  # an empty cell never disqualifies the table
        cell = tdbr2EOL(cell)
        if len(cell.contents) != 1:
            return False
        if not isinstance(cell.contents[0], NavigableString):
            return False
    return True
def table_withcelltag_2matrix(table):
    """Convert a table to a 2D matrix (list of row lists) of cell text.

    Tags nested inside a cell are ignored. This matters when an idf
    object has a name like "glass <thicknessis 3mm>": soup reads the
    "<thicknessis 3mm>" part as a tag inside the cell, so the table is
    not a simple table. Only the text of each cell is kept here.
    """
    # tdbr2EOL converts any '<br>' in the cell to a line ending before
    # cell2txt extracts the text.
    return [
        [cell2txt(tdbr2EOL(td)) for td in tr("td")]
        for tr in table("tr")
    ]
def table2matrix(table):
    """Convert a table to a 2D matrix: a list of rows, each a list of cells.

    A table that is not simple (see ``is_simpletable``) is handed to
    ``table_withcelltag_2matrix``. That happens when an idf object has a
    name like "glass <thicknessis 3mm>", which soup reads as a tag inside
    the cell. For a simple table each cell contributes its first content
    item, or "" when the cell is empty.
    """
    if not is_simpletable(table):
        return table_withcelltag_2matrix(table)
    matrix = []
    for tr in table("tr"):
        cells = []
        for td in tr("td"):
            # Any '<br>' in the cell becomes a line ending first.
            contents = tdbr2EOL(td).contents
            cells.append(contents[0] if contents else "")
        matrix.append(cells)
    return matrix
def table_withcelltag_2val_matrix(table):
    """Convert a table to a 2D matrix, turning numeric cells into floats.

    Tags nested inside a cell are ignored. This matters when an idf
    object has a name like "glass <thicknessis 3mm>": soup reads the
    "<thicknessis 3mm>" part as a tag inside the cell. The text of each
    cell is passed to ``float``; text that ``float`` rejects with a
    ValueError is kept as the string it was.
    """
    matrix = []
    for tr in table("tr"):
        cells = []
        for td in tr("td"):
            text = cell2txt(tdbr2EOL(td))
            try:
                value = float(text)
            except ValueError:
                value = text  # not a number: keep the text
            cells.append(value)
        matrix.append(cells)
    return matrix
def table2val_matrix(table):
    """Convert a table to a 2D matrix, turning numeric cells into floats.

    A table that is not simple (see ``is_simpletable``) is handed to
    ``table_withcelltag_2val_matrix``. That happens when an idf object
    has a name like "glass <thicknessis 3mm>", which soup reads as a tag
    inside the cell. For a simple table each cell contributes its first
    content item: as a float when ``float`` accepts it, unchanged when
    ``float`` raises ValueError, and "" when the cell is empty.
    """
    if not is_simpletable(table):
        return table_withcelltag_2val_matrix(table)
    matrix = []
    for tr in table("tr"):
        cells = []
        for td in tr("td"):
            contents = tdbr2EOL(td).contents
            if not contents:
                cells.append("")
                continue
            first = contents[0]
            try:
                value = float(first)
            except ValueError:
                value = first  # not a number: keep the cell as it is
            cells.append(value)
        matrix.append(cells)
    return matrix
def titletable(html_doc, tofloat=True):
    """Return a list of (title, table) pairs found in the html document.

    title = first content item of the <b> tag preceding the table
    table = rows -> [[cell1, cell2, ..], [cell1, cell2, ..], ..]

    With a truthy ``tofloat`` the cells go through ``table2val_matrix``
    (numbers become floats); otherwise ``table2matrix`` is used.
    """
    soup = BeautifulSoup(html_doc, "html.parser")
    tagged = soup.find_all(["b", "table"])  # every <b> and <table>, in order
    pairs = []
    for index, tag in enumerate(tagged):
        if tag.name != "table":
            continue
        # Step back to the nearest <b>. When no <b> comes before the
        # table, the walk stops on the very first collected tag.
        position = index
        while position > 0 and tagged[position].name != "b":
            position -= 1
        pairs.append((tagged[position], tag))
    convert = table2val_matrix if tofloat else table2matrix
    return [(title.contents[0], convert(tbl)) for title, tbl in pairs]
def _has_name(soup_obj):
    """Check whether soup_obj is a real soup object or just a string.

    An object counts as a soup object when it has a ``name`` attribute
    whose value is not None.

    Args:
        soup_obj: any object, typically an element met while walking a
            parsed document.

    Returns:
        True if ``soup_obj.name`` exists and is not None, else False.
    """
    # Identity test on purpose: "== None" would defer to the __eq__ of
    # whatever object is stored in ``name``.
    return getattr(soup_obj, "name", None) is not None
def lines_table(html_doc, tofloat=True):
    """Return a list of [lines, table] entries, one per table found.

    lines = all the significant lines before the table. These are the
        lines between this table and the previous table or 'hr' tag.
    table = rows -> [[cell1, cell2, ..], [cell1, cell2, ..], ..]

    The lines act as a description of what is in the table. The search
    starts after the first <p> of the document. ``tofloat`` picks the
    converter: True -> ``table2val_matrix``, False -> ``table2matrix``.
    """
    soup = BeautifulSoup(html_doc, "html.parser")
    converters = {True: table2val_matrix, False: table2matrix}
    boundary_tags = ("table", "hr", "tr", "td")
    found = []
    for element in soup.p.next_elements:  # start after the first para
        if not _has_name(element) or element.name != "table":
            continue
        # Hit a table: walk back through the document to collect the lines.
        described = []
        for earlier in element.previous_elements:
            if not _has_name(earlier):
                continue
            if earlier.name in ("br", None):
                continue  # no lines here
            if earlier.name in boundary_tags:
                # Just hit the previous table. All the lines are collected.
                break
            if earlier.parent.name == "p":
                # The parent <p> yields this text anyway.
                continue
            text = earlier.get_text()
            if text:  # skip blank lines
                described.append(text)
        described.reverse()  # collected backwards, so restore document order
        found.append([described, converters[tofloat](element)])
    return found
# Characters allowed to stay in a generated field name.
_NAME_CHARS = frozenset(string.ascii_letters + string.digits)


def _asciidigits(s):
    """Return the character s if it is an ASCII letter or digit, else "_".

    Membership is tested against a set of single characters, so anything
    that is not exactly one allowed character (including "" and longer
    strings) comes back as "_".
    """
    return s if s in _NAME_CHARS else "_"


def _nospace(s):
    """Replace every character of s that is not an ASCII letter or digit
    (spaces included) with "_"."""
    return "".join(_asciidigits(char) for char in s)


def _transpose(arr):
    """Swap rows and columns of a list of lists.

    As with ``zip``, the result is cut to the length of the shortest row.
    """
    return [list(column) for column in zip(*arr)]


def _make_ntgrid(grid):
    """Make a named tuple grid.

    [["",  "a b", "b c", "c d"],
     ["x y", 1,     2,     3 ],
     ["y z", 4,     5,     6 ],
     ["z z", 7,     8,     9 ],]
    will return
    ntcol(x_y=ntrow(a_b=1, b_c=2, c_d=3),
          y_z=ntrow(a_b=4, b_c=5, c_d=6),
          z_z=ntrow(a_b=7, b_c=8, c_d=9))

    The first row supplies the ntrow field names and the first column the
    ntcol field names, both cleaned with ``_nospace``. Names that
    ``collections.namedtuple`` does not accept as field names make it
    raise ValueError.
    """
    hnames = [_nospace(header) for header in grid[0][1:]]
    vnames = [_nospace(row[0]) for row in grid[1:]]
    ntcol = collections.namedtuple("ntcol", " ".join(vnames))
    ntrow = collections.namedtuple("ntrow", " ".join(hnames))
    # Pair values with header names; zip drops values beyond the header.
    ntrows = [ntrow(**dict(zip(hnames, row[1:]))) for row in grid[1:]]
    return ntcol(**dict(zip(vnames, ntrows)))
def named_grid_h(grid):
    """Make a horizontal named grid.

    The grid is passed to ``_make_ntgrid`` unchanged.
    """
    horizontal = _make_ntgrid(grid)
    return horizontal
def named_grid_v(grid):
    """Make a vertical named grid.

    The grid is transposed with ``_transpose`` before it is passed to
    ``_make_ntgrid``.
    """
    flipped = _transpose(grid)
    return _make_ntgrid(flipped)
def cell2txt(td):
    """Clean up the td and return the text in it.

    The cell is first normalised with ``tdbr2EOL``. Content items that
    have a ``contents`` attribute of their own (tags within the td) are
    ignored; the remaining items are joined into one string.
    """
    cell = tdbr2EOL(td)
    pieces = [item for item in cell.contents if not hasattr(item, "contents")]
    return "".join(pieces)