# Source code for xml_extract

#!/usr/bin/python3                                                                                                                                   
# -*- coding: utf-8 -*-      

"""
Prints the coordinates of paragraphs, lines and words found in a given .xml file.
The .xml file must first be cleaned by xml_cleaner.py.
Helps in checking what is a word and what is a picture fragment.
"""
import sys
import xml.etree.ElementTree as ET
import string
from common_text_features_functions import get_punct_amount
from collections import OrderedDict

# Coordinate keys of the paragraph currently being processed.
# Appended to by create_words_lines_output(), read by create_output() to build
# the "PARAGRAPH" record, and emptied by get_paragraphs_xml() after each
# paragraph has been printed.
para_begin_end = []
# Already formatted "LINE\t..." and "WORD\t..." records of the current
# paragraph, in output order. Filled by create_words_lines_output(), written
# out by create_output() and emptied by get_paragraphs_xml().
output_words_lines = []
# NOTE(review): the functions in this file that use the name ``node_list``
# assign their own local variable, so this module-level list appears to be
# unused here - confirm no other module imports it before removing.
node_list = []

def get_alpha(line):
    """
    Returns the number of alphabetic characters in a string.

    Only characters for which ``str.isalpha()`` is true are counted, so
    digits, whitespace and punctuation do not contribute.

    Args:
        line (str) : string which needs to be checked

    Returns:
        int : amount of alphabetic characters in ``line``
    """
    return sum(1 for letter in line if letter.isalpha())
def check_paragraph(para_xml):
    """
    Checks if a paragraph contains trash.

    The text of the paragraph is gathered from its "WORD" elements (when
    present) followed by its "CHARACTER" elements, or by its "LINE" elements
    when there is no "CHARACTER" element at all. The paragraph is accepted
    when that text has more alphabetic characters (get_alpha) than the value
    returned by get_punct_amount.

    Args:
        para_xml (str) : xml of paragraph

    Returns:
        int : 1 if the paragraph looks like real text, 0 if it looks like trash
    """
    root = ET.fromstring(para_xml)
    # Element.getiterator() was removed in Python 3.9; Element.iter() is the
    # supported way to walk the whole subtree.
    tags = {element.tag for element in root.iter()}

    # "WORD" is handled on its own; "CHARACTER" versus "LINE" is a separate
    # either-or choice, exactly as in the original if / if-else chain.
    wanted_tags = []
    if "WORD" in tags:
        wanted_tags.append("WORD")
    wanted_tags.append("CHARACTER" if "CHARACTER" in tags else "LINE")

    text = "".join(
        node.text
        for tag in wanted_tags
        for node in root.iter(tag)
        if node.text is not None
    )

    return 1 if get_alpha(text) > get_punct_amount(text) else 0
def create_output():
    """
    Prints out the final output for the current paragraph.

    Writes one "PARAGRAPH" record to stdout, built from the first two values
    of the first entry and the last two values of the last entry of
    ``para_begin_end``, followed by every record stored in
    ``output_words_lines``, one per line.

    The "PARAGRAPH" record is skipped when ``para_begin_end`` is empty
    (no coordinates were collected), instead of failing with IndexError.
    """
    if para_begin_end:
        first = para_begin_end[0]
        last = para_begin_end[-1]
        corners = (first[0], first[1], last[2], last[3])
        sys.stdout.write(
            "PARAGRAPH\t" + " ".join(str(value) for value in corners) + "\n"
        )

    for record in output_words_lines:
        sys.stdout.write(record + "\n")
def create_words_lines_output(coordinates_words):
    """
    Function which helps in making data for lines and words.

    Appends every key of ``coordinates_words`` to ``para_begin_end`` and adds
    to ``output_words_lines`` one "LINE" record (first two values of the
    first key, last two values of the last key) followed by one "WORD"
    record per space-separated piece of each value.

    Args:
        coordinates_words (dict) : ordered mapping of coordinate tuples
            (x1, y1, x2, y2) to the text found at those coordinates.
            The tuple items must be strings, because they are joined as-is.

    An empty mapping is ignored; there is nothing to describe and no first
    or last key to build the "LINE" record from.
    """
    coordinates = list(coordinates_words)
    if not coordinates:
        return

    para_begin_end.extend(coordinates)

    first, last = coordinates[0], coordinates[-1]
    corners = (first[0], first[1], last[2], last[3])
    output_words_lines.append(
        "LINE\t" + " ".join(str(value) for value in corners)
    )

    for key, value in coordinates_words.items():
        prefix = "WORD\t" + " ".join(key) + " "
        # Splitting on a single space (not on any whitespace) is kept on
        # purpose so the emitted records stay exactly as before.
        for word in value.split(" "):
            output_words_lines.append(prefix + word)
def _collect_leaf_text(root, tag, coordinates_word):
    """Store the stripped text of every childless ``tag`` element under its coordinates."""
    for element in root.iter(tag):
        # Only elements without children are of interest. len() is used
        # because truth-testing an Element is deprecated.
        if len(element) != 0:
            continue
        # The value of the first attribute is taken as "x1,y1,x2,y2".
        # NOTE(review): assumes the coordinates are always the first
        # attribute of the element - confirm against xml_cleaner.py output.
        coordinates = list(element.attrib.values())[0].split(',')
        key = (coordinates[0], coordinates[1], coordinates[2], coordinates[3])
        if element.text is not None:
            coordinates_word[key] = element.text.strip()


def get_words_xml(line_xml):
    """
    Get words from xml file.

    Collects the text of the childless "WORD" elements (when the tag is
    present) and then of the childless "CHARACTER" elements, or of the
    childless "LINE" elements when there is no "CHARACTER" element at all.
    When anything was collected it is handed to create_words_lines_output.

    Args:
        line_xml (str) : xml of "LINE"
    """
    root = ET.fromstring(line_xml)
    coordinates_word = OrderedDict()
    # Element.getiterator() was removed in Python 3.9; Element.iter() is the
    # supported way to walk the whole subtree.
    tags = {element.tag for element in root.iter()}

    if "WORD" in tags:
        _collect_leaf_text(root, "WORD", coordinates_word)
    # "CHARACTER" versus "LINE" is an either-or choice independent of "WORD",
    # exactly as in the original if / if-else chain.
    if "CHARACTER" in tags:
        _collect_leaf_text(root, "CHARACTER", coordinates_word)
    else:
        _collect_leaf_text(root, "LINE", coordinates_word)

    if coordinates_word:
        create_words_lines_output(coordinates_word)
def get_lines_xml(para_xml):
    """
    Get lines from xml.

    Every "LINE" element found in the paragraph is serialized again and
    passed on to get_words_xml, in document order.

    Args:
        para_xml (str) : xml of "PARAGRAPH"
    """
    paragraph = ET.fromstring(para_xml)
    for line_element in paragraph.iter("LINE"):
        get_words_xml(ET.tostring(line_element))
def get_paragraphs_xml(root):
    """
    Get paragraphs from xml.

    Each "PARAGRAPH" element under ``root`` is serialized and checked with
    check_paragraph. Accepted paragraphs are processed line by line, printed
    with create_output, and the module-level buffers are then emptied for
    the next paragraph.

    Args:
        root : parsed xml element whose subtree is searched for paragraphs
    """
    for paragraph in root.iter("PARAGRAPH"):
        serialized = ET.tostring(paragraph)
        if not check_paragraph(serialized):
            continue

        get_lines_xml(serialized)
        create_output()

        # Empty the shared lists in place so other references stay valid.
        del para_begin_end[:]
        del output_words_lines[:]
if __name__ == "__main__":
    # Read the whole document from stdin in one go and parse it.
    try:
        root = ET.fromstring(sys.stdin.read())
    except (ET.ParseError, UnicodeDecodeError) as error:
        # Unparsable input is skipped on purpose: the exit status stays 0 as
        # before, but the reason is now reported instead of being swallowed
        # together with unrelated errors such as KeyboardInterrupt.
        sys.stderr.write("xml_extract: cannot parse input: " + str(error) + "\n")
        sys.exit(0)
    get_paragraphs_xml(root)