# Source code for xml_extract

#!/usr/bin/python3                                                                                                                                   
# -*- coding: utf-8 -*-      

"""
Prints coordinates of paragraphs, words and lines from a given .xml file.
The .xml file needs to be cleaned by xml_cleaner.py first.
Helps in checking what is a word and what is a picture fragment.
"""
import sys
import xml.etree.ElementTree as ET
import string
from common_text_features_functions import get_punct_amount
from collections import OrderedDict

# Module-level buffers shared by the functions below.
# Coordinate tuples of the current paragraph; filled by
# create_words_lines_output() and emptied by get_paragraphs_xml().
para_begin_end = []
# Ready-to-print "LINE\t..." / "WORD\t..." records of the current paragraph.
output_words_lines = []
# NOTE(review): not referenced by the functions visible in this file (they
# use local variables of the same name) - confirm before removing.
node_list = []

def get_alpha(line):
    """
    Returns amount of alphabetic chars.

    Only characters for which ``str.isalpha()`` is true are counted, so
    digits, whitespace and punctuation do not contribute.

    Args:
        line (str) : string which needs to be checked

    Returns:
        int : number of alphabetic characters in ``line``
    """
    return sum(1 for letter in line if letter.isalpha())

def check_paragraph(para_xml):
    """
    Checks if paragraph contains trash, returns 1 if not and 0 if yes

    The text of every "WORD" element is collected, followed by the text of
    every "CHARACTER" element or, when the paragraph has no "CHARACTER"
    element at all, of every "LINE" element. The paragraph is accepted when
    that text has more alphabetic chars (get_alpha) than the value returned
    by get_punct_amount.

    Args:
        para_xml (str) : xml of paragraph

    Returns:
        int : 1 when the paragraph looks like text, 0 otherwise
    """
    root = ET.fromstring(para_xml)
    # Element.getiterator() was removed in Python 3.9; iter() replaces it.
    tags = {ele.tag for ele in root.iter()}

    # iter("WORD") yields nothing when there is no such element, so WORD can
    # be scanned unconditionally; LINE is only the fallback for CHARACTER.
    sources = ["WORD", "CHARACTER" if "CHARACTER" in tags else "LINE"]

    parts = []
    for tag in sources:
        for node in root.iter(tag):
            if node.text is not None:
                parts.append(node.text)
    text = "".join(parts)

    if get_alpha(text) > get_punct_amount(text):
        return 1
    return 0
    
def create_output():
    """
    Prints out the final output

    Writes one "PARAGRAPH" record to stdout, built from the first two
    values of the first collected coordinate tuple and the last two values
    of the last one, followed by every record stored in
    ``output_words_lines``. Nothing is written when no coordinates were
    collected.
    """
    if not para_begin_end:
        # Indexing [0] / [-1] below would raise IndexError on an empty buffer.
        return

    first = para_begin_end[0]
    last = para_begin_end[-1]
    corners = (first[0], first[1], last[2], last[3])
    sys.stdout.write("PARAGRAPH\t" + " ".join(str(c) for c in corners) + "\n")
    for record in output_words_lines:
        sys.stdout.write(record + "\n")

def create_words_lines_output(coordinates_words):
    """
    Function which helps in making data for lines and words

    Appends every key of ``coordinates_words`` to ``para_begin_end``, then
    appends one "LINE" record (first two values of the first key, last two
    values of the last key) and one "WORD" record per whitespace-separated
    token of each value to ``output_words_lines``.

    Args:
        coordinates_words (dict) : maps a tuple of coordinate strings to
            the text found at these coordinates, in document order
    """
    if not coordinates_words:
        # No key to build the LINE record from; [0] would raise IndexError.
        return

    coordinates = list(coordinates_words)
    para_begin_end.extend(coordinates)

    first = coordinates[0]
    last = coordinates[-1]
    corners = (first[0], first[1], last[2], last[3])
    output_words_lines.append("LINE\t" + " ".join(str(c) for c in corners))

    for key, value in coordinates_words.items():
        prefix = "WORD\t" + " ".join(key) + " "
        # split() without argument drops empty tokens, so repeated spaces or
        # an empty text no longer produce WORD records without a word.
        for word in value.split():
            output_words_lines.append(prefix + word)

def _collect_leaf_coordinates(root, tag, coordinates_word):
    """
    Stores the stripped text of every childless ``tag`` element below
    ``root`` in ``coordinates_word``, keyed by its four coordinates.
    """
    for node in root.iter(tag):
        # len() counts sub-elements; truth-testing an Element is deprecated.
        if len(node) or node.text is None or not node.attrib:
            continue
        # The coordinates are read from the first attribute of the element.
        coordinates = next(iter(node.attrib.values())).split(',')
        if len(coordinates) < 4:
            continue
        coordinates_word[tuple(coordinates[:4])] = node.text.strip()


def get_words_xml(line_xml):
    """
    Get words from xml file

    Collects the text of childless "WORD" elements, then of childless
    "CHARACTER" elements or, when the line has no "CHARACTER" element at
    all, of childless "LINE" elements. Elements without text, without any
    attribute or with fewer than four comma-separated coordinate values are
    skipped. When anything was collected it is passed on to
    create_words_lines_output.

    Args:
        line_xml (str) : xml of "LINE"
    """
    root = ET.fromstring(line_xml)
    coordinates_word = OrderedDict()
    # Element.getiterator() was removed in Python 3.9; iter() replaces it.
    tags = {ele.tag for ele in root.iter()}

    # iter("WORD") yields nothing when there is no such element, so WORD can
    # be scanned unconditionally; LINE is only the fallback for CHARACTER.
    _collect_leaf_coordinates(root, "WORD", coordinates_word)
    fallback = "CHARACTER" if "CHARACTER" in tags else "LINE"
    _collect_leaf_coordinates(root, fallback, coordinates_word)

    if coordinates_word:
        create_words_lines_output(coordinates_word)

def get_lines_xml(para_xml):
    """
    Get lines from xml

    Every "LINE" element of the paragraph is serialised again and handed
    to get_words_xml.

    Args:
        para_xml (str) : xml of "PARAGRAPH"
    """
    root = ET.fromstring(para_xml)
    for line in root.iter("LINE"):
        get_words_xml(ET.tostring(line))
        
def get_paragraphs_xml(root):
    """
    Get paragraphs from xml

    Every "PARAGRAPH" element accepted by check_paragraph is processed by
    get_lines_xml and printed by create_output. The shared buffers are
    emptied after each processed paragraph.

    Args:
        root (xml.etree.ElementTree.Element) : root of the parsed document
    """
    for paragraph in root.iter("PARAGRAPH"):
        para_xml = ET.tostring(paragraph)
        if not check_paragraph(para_xml):
            continue
        try:
            get_lines_xml(para_xml)
            # create_output indexes para_begin_end, so call it only when
            # at least one coordinate tuple was collected.
            if para_begin_end:
                create_output()
        finally:
            # Reset even on error so a later call does not see stale data.
            para_begin_end[:] = []
            output_words_lines[:] = []

if __name__ == "__main__":
    # Read the whole document from stdin and print its paragraphs.
    try:
        root = ET.fromstring(sys.stdin.read())
    except (ET.ParseError, UnicodeError) as err:
        # Unparsable input still ends the script with status 0, as before,
        # but the reason is now reported instead of being swallowed.
        sys.stderr.write("xml_extract: cannot parse input: " + str(err) + "\n")
        sys.exit(0)

    get_paragraphs_xml(root)