Source code for eppy.results.fasthtml

# Copyright (c) 2020 Santosh Philip
# =======================================================================
#  Distributed under the MIT License.
#  (See accompanying file LICENSE or copy at
#  http://opensource.org/licenses/MIT)
# =======================================================================
# -*- coding: utf-8 -*-
"""functions to do a fast read from the E+ HTML table file"""
# TODO : move this to eppy.readhtml
# TODO Document it in user documentation.
import copy
from io import StringIO

from eppy.results import readhtml


def _decodeline(line, encoding="utf-8"):
    """Decode a bytes line to str; return non-bytes input unchanged.

    The line is first decoded with `encoding`. If that raises a
    UnicodeDecodeError, it is decoded again as ISO-8859-2.

    Why ISO-8859-2? It looks like E+ uses this encoding in some example
    files, and that text then ends up in the HTML output file.

    If `line` has no `decode` method (for example, it is already a str),
    it is returned as is.

    # TODO this code looks fragile. Maybe use standard library HTML parse
    # to deal with encoding?

    Parameters
    ----------
    line : str, bytes
        A single line read from the E+ HTML table file.
    encoding : str
        The encoding to try first.

    Returns
    -------
    line : str
        The decoded line, or `line` itself when it cannot be decoded
        because it is not a bytes-like object.
    """
    try:
        return line.decode(encoding)
    except AttributeError:
        # no .decode() -> not bytes (e.g. the file was opened in text mode)
        return line
    except UnicodeDecodeError:
        # encoding could be ISO-8859-2 in e+ html
        return line.decode("ISO-8859-2")



[docs]
def getnexttable(fhandle):
    """Return the next table found in the html file.

    Reading continues from the current position of `fhandle`, line by line.
    Lines are skipped until one starts with ``<table`` (after stripping
    whitespace). From that line on, every line is collected up to and
    including the first later line that starts with ``</table``.

    Parameters
    ----------
    fhandle : file like object
        A file handle to the E+ HTML table file

    Returns
    -------
    table : str
        The table in HTML format. Empty if no table start is found.
    """
    table_parts = []

    # Phase 1: skip forward to the opening tag of the table.
    for raw in fhandle:
        text = _decodeline(raw)
        if not text.strip().startswith("<table"):
            continue
        table_parts.append(text)
        break

    # Phase 2: keep everything up to and including the closing tag.
    for raw in fhandle:
        text = _decodeline(raw)
        table_parts.append(text)
        if text.strip().startswith("</table"):
            return "".join(table_parts)

    # Input ran out before a closing tag was seen.
    return "".join(table_parts)




[docs]
def tablebyname(filehandle, header):
    """fast extraction of the table using the header to identify the table

    This function reads only one table from the HTML file. This is in contrast to `results.readhtml.titletable` that will read all the tables into memory and allows you to interactively look thru them. The function `results.readhtml.titletable` can be very slow on large HTML files.

    This function is useful when you know which table you are looking for. It looks for the title line that is in bold just before the table. Some tables don't have such a title in bold. This function will not work for tables that don't have a title in bold

    The file handle is used as a context manager, so it is closed when this function returns.

    Parameters
    ----------
    filehandle : file like object
        A file handle to the E+ HTML table file
    header: str
        This is the title of the table you are looking for

    Returns
    -------
    titleandtable : list or None
        - [title, table]
            - title = previous item with a <b> tag
            - table = rows -> [[cell1, cell2, ..], [cell1, cell2, ..], ..]
        - None if no line matching the header is found, or if no table
          could be parsed after it.
    """
    htmlheader = f"<b>{header}</b><br><br>"
    thetable = None

    with filehandle:
        for line in filehandle:
            line = _decodeline(line)
            if line.strip() == htmlheader:
                # the handle now sits just after the title line, so the
                # next table read from it is the one that belongs to it
                justtable = getnexttable(filehandle)
                thetable = f"{htmlheader}\n{justtable}"
                break

    if thetable is None:
        # header never seen: nothing to parse
        return None

    htables = readhtml.titletable(StringIO(thetable))
    try:
        return list(htables[0])
    except IndexError:
        return None




[docs]
def get_upto_nexttable(fhandle):
    """Return everything from the current position to the end of the next table.

    `tablebyindex` uses this function: the title of a table sits in the
    lines before the table, so those lines are kept together with the table.

    Each line is decoded and collected. Reading stops after the first line
    that starts with ``</table`` (after stripping whitespace), or when the
    input is exhausted.

    Parameters
    ----------
    fhandle : file like object
        A file handle to the E+ HTML table file

    Returns
    -------
    lines_and_table : str
        The table in HTML format with lines before it.
    """
    pieces = []
    for raw in fhandle:
        decoded = _decodeline(raw)
        pieces.append(decoded)
        is_table_end = decoded.strip().startswith("</table")
        if is_table_end:
            return "".join(pieces)
    # Input ran out before a closing table tag was seen.
    return "".join(pieces)




[docs]
def tablebyindex(filehandle, index):
    """fast extraction of the table using the index to identify the table

    This function reads only one table from the HTML file. This is in contrast to `results.readhtml.titletable` that will read all the tables into memory and allows you to interactively look thru them. The function `results.readhtml.titletable` can be very slow on large HTML files.

    This function is useful when you know which table you are looking for. It does not work with negative indices, like you can in a list: a negative index returns None. If you know a way to make negative indices work, do a pull request :-)

    The file handle is used as a context manager, so it is closed when this function returns.

    Parameters
    ----------
    filehandle : file like object
        A file handle to the E+ HTML table file
    index: int
        This is the index of the table you are looking for

    Returns
    -------
    titleandtable : (str, list) or None
        - (title, table)
            - title = previous item with a <b> tag
            - table = rows -> [[cell1, cell2, ..], [cell1, cell2, ..], ..]
        - None if `index` is negative or no table could be parsed at
          that position.
    """
    thetable = None
    with filehandle:
        # Each call consumes the handle up to and including the next closing
        # table tag. After index + 1 calls, `thetable` holds the wanted table
        # together with the lines (including its title) that precede it.
        for _ in range(index + 1):
            thetable = get_upto_nexttable(filehandle)

    if thetable is None:
        # negative index: the loop above never ran
        return None

    htables = readhtml.titletable(StringIO(thetable))
    try:
        return htables[0]
    except IndexError:
        return None
Source code for eppy.results.fasthtml

eppy

Navigation

Related Topics