Source code for common_text_features_functions

#!/usr/bin/python3                                                                                                                                                                              
# -*- coding: utf-8 -*-       

"""Module which contains functions which are used more than once in text features extraction scripts
"""

import xml.etree.ElementTree as ET
import string

[docs]def cut_xml(_x1, _y1, _x2, _y2, xml_file):
    """
    Returns text, contained in specified area (recognized rectangle in newspaper), from xml file.

    Args:
        _x1 (int) : Upper left-sided x coordinate
        _y1 (int) : Upper left-sided y coordinate
        _x2 (int) : Lower right-sided x coordinate
        _y2 (int) : Lower right-sided y coordinate
    """
    words = []
    for line in xml_file:
        type_of_line = line.split("\t")[0]
        if type_of_line == "WORD":
            line_data =line.split("\t")[1].split(" ")
            if len(line_data) >= 5:
                x1 = line_data[0]
                y1 = line_data[1]
                x2 = line_data[2]
                y2 = line_data[3]
                word = line_data[4]
                if (int(x1) > int(_x1) and int(x2) < int(_x2) and int(y1) > int(_y1) and int(y2) < int(_y2)):
                    words.append(word.strip())
    return(words)

[docs]def get_punct_amount(words_list):
    """
    Returns number of punctation in desired rectangle.

    Args:
        words_list (list) : list of words in which we need to check amount of punctation
    """

    count = lambda l1, l2: len(list(filter(lambda c: c in l2, l1)))
    return sum([count(word, string.punctuation) for word in words_list])