Source code for eppy.results.fasthtml
# Copyright (c) 2020 Santosh Philip
# =======================================================================
# Distributed under the MIT License.
# (See accompanying file LICENSE or copy at
# http://opensource.org/licenses/MIT)
# =======================================================================
# -*- coding: utf-8 -*-
"""functions to do a fast read from the E+ HTML table file"""
# TODO: move this to eppy.readhtml
# TODO: document it in the user documentation.
import copy
from io import StringIO
from eppy.results import readhtml
def _decodeline(line, encoding="utf-8"):
"""decodes bytes to string, if line is not bytes, line is returned
It will first attempt to decode line with value of `encoding`. If that fails, it will try with encoding="ISO-8859-2". If that fails, it will return line.
Why is it trying encoding="ISO-8859-2". Looks like E+ uses this encoding in some example files and which is then output in the HTML file
# TODO this code looks fragile. Maybe use standard library HTML parse to deal with encoding?
Parameters
----------
line : str, bytes
encoding : str
Returns
-------
line : str
decoded line
"""
try:
return line.decode(encoding)
except (AttributeError, UnicodeDecodeError) as e:
if e.__class__ == UnicodeDecodeError:
# encoding could be ISO-8859-2 in e+ html
return _decodeline(line, encoding="ISO-8859-2")
else:
return line
def getnexttable(fhandle):
    """Return the next table found in the HTML file as one string.

    Reading continues line by line from the current position of `fhandle`.
    Lines are skipped until one starts (after stripping whitespace) with
    ``<table``; from there every line is collected up to and including the
    first line that starts with ``</table``.

    Parameters
    ----------
    fhandle : file like object
        A file handle to the E+ HTML table file

    Returns
    -------
    table : str
        The table in HTML format. Empty if no ``<table`` line is found.
    """
    collected = []

    # Phase 1: skip ahead to the line that opens the table.
    for raw in fhandle:
        text = _decodeline(raw)
        if not text.strip().startswith("<table"):
            continue
        collected.append(text)
        break

    # Phase 2: keep everything up to and including the closing tag line.
    for raw in fhandle:
        text = _decodeline(raw)
        collected.append(text)
        if text.strip().startswith("</table"):
            return "".join(collected)

    # Input ran out before a closing tag (or before any table at all).
    return "".join(collected)
def tablebyname(filehandle, header):
    """fast extraction of the table using the header to identify the table

    This function reads only one table from the HTML file. This is in
    contrast to `results.readhtml.titletable` that will read all the tables
    into memory and allows you to interactively look thru them. The function
    `results.readhtml.titletable` can be very slow on large HTML files.

    This function is useful when you know which table you are looking for.
    It looks for the title line that is in bold just before the table, i.e.
    a line that is exactly ``<b>{header}</b><br><br>`` once surrounding
    whitespace is stripped. Some tables don't have such a title in bold.
    This function will not work for tables that don't have a title in bold.

    `filehandle` is used as a context manager, so it is closed when this
    function returns.

    Parameters
    ----------
    filehandle : file like object
        A file handle to the E+ HTML table file
    header : str
        This is the title of the table you are looking for

    Returns
    -------
    titleandtable : list or None
        The first entry produced by `readhtml.titletable` for the extracted
        snippet, converted to a list; presumably ``[title, table]`` where

        - title = previous item with a <b> tag
        - table = rows -> [[cell1, cell2, ..], [cell1, cell2, ..], ..]

        ``None`` if the header is not found in the file or no table could
        be parsed after it.
    """
    htmlheader = f"<b>{header}</b><br><br>"
    thetable = None
    with filehandle:
        for line in filehandle:
            line = _decodeline(line)
            if line.strip() == htmlheader:
                justtable = getnexttable(filehandle)
                thetable = f"{htmlheader}\n{justtable}"
                break
    if thetable is None:
        # Header never matched: report "not found" the same way as the
        # "nothing parsed" case below instead of failing on an unbound name.
        return None
    htables = readhtml.titletable(StringIO(thetable))
    try:
        return list(htables[0])
    except IndexError:
        return None
def get_upto_nexttable(fhandle):
    """Return everything from the current position to the end of the next table.

    `tablebyindex` relies on this: the title of a table sits in the lines
    just before it, so those leading lines are kept together with the table.
    Reading stops after the first line that starts (after stripping
    whitespace) with ``</table``, or when the input runs out.

    Parameters
    ----------
    fhandle : file like object
        A file handle to the E+ HTML table file

    Returns
    -------
    lines_and_table : str
        The table in HTML format with lines before it.
    """

    def _through_table_end():
        # Yield decoded lines one by one, stopping right after the line
        # that closes the table so nothing beyond it is consumed.
        for raw in fhandle:
            text = _decodeline(raw)
            yield text
            if text.strip().startswith("</table"):
                return

    return "".join(_through_table_end())
def tablebyindex(filehandle, index):
    """fast extraction of the table using the index to identify the table

    This function reads only one table from the HTML file. This is in
    contrast to `results.readhtml.titletable` that will read all the tables
    into memory and allows you to interactively look thru them. The function
    `results.readhtml.titletable` can be very slow on large HTML files.

    This function is useful when you know which table you are looking for.
    It does not work with negative indices, like you can in a list. If you
    know a way to make negative indices work, do a pull request :-)

    `filehandle` is used as a context manager, so it is closed when this
    function returns or raises.

    Parameters
    ----------
    filehandle : file like object
        A file handle to the E+ HTML table file
    index : int
        This is the index of the table you are looking for (0 is the first
        table in the file)

    Returns
    -------
    titleandtable : (str, list) or None
        The first entry produced by `readhtml.titletable` for the extracted
        snippet; presumably ``(title, table)`` where

        - title = previous item with a <b> tag
        - table = rows -> [[cell1, cell2, ..], [cell1, cell2, ..], ..]

        ``None`` if no table could be parsed at that index.

    Raises
    ------
    ValueError
        If `index` is negative.
    """
    with filehandle:
        if index < 0:
            # range(index + 1) would be empty and leave nothing to parse.
            raise ValueError(
                f"index must be non-negative, got {index}; "
                "negative indices are not supported"
            )
        # Read the file one table at a time; only the chunk that ends with
        # the requested table is kept.
        for _ in range(index + 1):
            thetable = get_upto_nexttable(filehandle)
    htables = readhtml.titletable(StringIO(thetable))
    try:
        return htables[0]
    except IndexError:
        return None