I want to list the paths of all the elements in an XML document relative to the root. For example:
<A>
<B>
<C>Name</C>
<D>Name</D>
</B>
</A>
So I want to list them as:
A/B/C
A/B/D
I am able to parse the XML using Python's "Element" object, but I am not able to extract the XPath from it. Any help?
One can construct a parent map of the parsed tree and then use it to build the needed XPath:
import xml.etree.ElementTree as parser
def get_parent_map(root):
    """Return a dict mapping each element below ``root`` to its direct parent.

    ``root`` itself has no parent inside the tree, so it never appears as a key.
    """
    parents = {}
    for parent in root.iter():
        for child in parent:
            parents[child] = parent
    return parents
def extract_text_info(root, original_root, parent_map=None):
    """Print the path and text of every text-bearing element below ``root``.

    For each descendant of ``root`` whose text is not blank, two lines are
    printed: the '/'-joined tag path from ``original_root`` down to that
    element, followed by the element's text exactly as stored.

    Args:
        root: Element whose descendants are reported.
        original_root: Top of the tree; every printed path starts with its tag.
        parent_map: Optional child -> parent mapping for the tree under
            ``original_root``. When omitted it is built once here and then
            handed down the recursion, instead of being rebuilt for the whole
            tree on every recursive call.
    """
    if parent_map is None:
        parent_map = get_parent_map(original_root)

    for child in root:
        text = child.text
        if text is not None and text.strip():
            # Climb from the element to the top, collecting tags leaf-first.
            tags = []
            node = child
            while node is not original_root:
                tags.append(node.tag)
                node = parent_map[node]
            tags.append(original_root.tag)
            print('/'.join(reversed(tags)))
            print(text)
        # Recurse regardless of whether this child had text of its own.
        extract_text_info(child, original_root, parent_map)
Then we have
# Sample document from the question.
xml = """<A>
<B>
<C>Name</C>
<D>Name</D>
</B>
</A> """
# Parse the sample and report every element with non-blank text; passing the
# same element as both arguments makes the printed paths start at the parsed
# root.
root = parser.fromstring(xml)
extract_text_info(root, root)
> A/B/C
> Name
> A/B/D
> Name
One of the ways I figured out is to do it in code:
import xml.etree.ElementTree as ET
def parseXML(root, sm):
    """Print the slash-separated path of every childless element under ``root``.

    Args:
        root: Element to walk; its own tag is appended to the path first.
        sm: Path accumulated so far (pass "" for the top-level call). Each
            level appends "/" plus the element's tag, so printed paths begin
            with a leading "/".
    """
    # Keep only what follows the last '}' so a "{namespace-uri}tag" name is
    # reduced to "tag"; rfind returns -1 when there is no '}', which keeps
    # the tag whole.
    local_name = root.tag[root.tag.rfind('}') + 1:]
    sm = sm + "/" + local_name

    for child in root:
        parseXML(child, sm)

    # len() on an element counts its direct children; no copy is needed.
    if len(root) == 0:
        print(sm)
# Load the document at the relative path 'test.xml' and print the path of
# every childless element, starting from an empty path prefix.
tree = ET.parse('test.xml')
root = tree.getroot()
parseXML(root,"")
I don't know whether there is a built-in function for this.
sample.html
<A>
<B>
<C>Name1</C>
<D>Name2</D>
</B>
</A>
parse.py
from bs4 import BeautifulSoup
def get_root_elements(path_to_file):
    """Return the elements of a parsed file from the first deepest one onward.

    The file is parsed with BeautifulSoup's 'lxml' parser. Each element's
    depth is measured as its number of ancestors; the result is the slice of
    ``soup.find_all()`` that starts at the first element with the greatest
    depth and runs to the end of that list.

    Args:
        path_to_file: Path of the markup file to parse.

    Returns:
        A list of elements; empty when the document contains no elements.
    """
    # Close the file as soon as parsing is done instead of leaking the handle.
    with open(path_to_file) as handle:
        soup = BeautifulSoup(handle, 'lxml')

    all_elements = soup.find_all()
    if not all_elements:
        # max() below would raise ValueError on an empty sequence.
        return []

    depths = [len(list(element.parents)) for element in all_elements]
    # Compute the maximum once; list.index gives its first occurrence.
    first_deepest = depths.index(max(depths))
    # NOTE(review): everything after the first deepest element is returned,
    # even if shallower -- confirm this is intended for uneven trees.
    return all_elements[first_deepest:]
def get_path(element):
    """Return the ' / '-separated path of names from outermost ancestor to ``element``.

    Ancestors named '[document]', 'body' or 'html' are left out of the path;
    the element's own name is always included.
    """
    skipped = ['[document]', 'body', 'html']
    # Collect names innermost-first, then flip to outermost-first.
    names = [element.name]
    for ancestor in element.parents:
        if ancestor.name not in skipped:
            names.append(ancestor.name)
    names.reverse()
    return ' / '.join(names)
Python Shell
In [1]: file = 'path/to/sample.html'
In [2]: run parse.py
In [3]: roots = get_root_elements(file)
In [4]: print(roots)
[<c>Name1</c>, <d>Name2</d>]
In [5]: for root in roots:
...: print(get_path(root))
a / b / c
a / b / d