I have this script for cleaning up SVG files with Python and lxml. It removes invisible elements and tries to solve a few selected namespace issues:
# Clean up an SVG file: drop a known-bad Illustrator tag, repair the xlink
# namespace declaration, blank out invisible elements and strip foreign
# namespaced attributes, then write the result next to the input file.
import re

from lxml import etree

XLINK_DECL = b'xmlns:xlink="http://www.w3.org/1999/xlink"'
XLINK_ATTR_PREFIX = '{http://www.w3.org/1999/xlink}'
# property -> value that makes an element invisible
HIDDEN_VALUES = {'display': 'none', 'visibility': 'hidden', 'opacity': '0'}

path = '/image.svg'


def _is_hidden(el):
    """Return True if *el* is hidden via its inline style or attributes.

    The inline ``style`` is parsed into individual declarations so that
    properties such as ``fill-opacity:0`` are not mistaken for ``opacity:0``
    (a plain substring test would match both).
    """
    style = {}
    for declaration in el.get('style', '').split(';'):
        name, _, value = declaration.partition(':')
        style[name.strip()] = value.strip()
    return any(
        style.get(prop) == value or el.get(prop) == value
        for prop, value in HIDDEN_VALUES.items()
    )


# Read raw bytes: lxml rejects ``str`` input that carries an XML encoding
# declaration, and the parser should decode according to that declaration.
with open(path, 'rb') as f:
    svg_xml = f.read()

# resolve problematic namespace issues
# remove specific and undefined Illustrator tags
svg_xml = svg_xml.replace(b'<i:pgf></i:pgf>', b'')

# make sure the xmlns:xlink URL is correct (first declaration only; accepts
# single- or double-quoted values instead of assuming double quotes)
svg_xml = re.sub(
    rb'xmlns:xlink\s*=\s*(?:"[^"]*"|\'[^\']*\')',
    XLINK_DECL,
    svg_xml,
    count=1,
)

# iterate over tree and remove invisible elements
tree = etree.fromstring(svg_xml)
for _, el in etree.iterwalk(tree):
    # Comments and processing instructions have a non-string tag; skip them.
    if not isinstance(el.tag, str):
        continue
    if _is_hidden(el):
        # Drops children, text and all attributes, leaving an empty element.
        el.clear()
        continue
    # remove any external namespace attributes on elements, but keep xlink
    # ones (e.g. xlink:href on <use>), whose namespace was repaired above.
    # Iterate over a snapshot because attributes are deleted in the loop.
    for name in list(el.attrib):
        if name.startswith('{') and not name.startswith(XLINK_ATTR_PREFIX):
            del el.attrib[name]

# output cleaned SVG XML; tostring() with an encoding returns bytes, so the
# file has to be opened in binary mode
with open(path + '_out.svg', 'wb') as f:
    f.write(etree.tostring(tree, xml_declaration=True, encoding='utf-8'))
The problem I'm having is that there are so many different namespace variations and problems that a selective approach, as shown in the script, makes no sense. Moreover, with improperly defined namespaces (e.g. invalid namespace URLs), lxml cannot parse the input SVG file at all.
Is there any way to make lxml ignore every additional namespace except the SVG namespace itself? All external namespace tags and attributes should be removed from the output.
I tried using Scour: https://github.com/scour-project/scour But to no avail, because Scour is not able to parse such SVG files either.
Regex, possibly? Probably not, because XML is pretty complex and unreliable concerning its exact notation ... Are there any lxml internal tools or parameters?
Here's an example of an invalid SVG XML file:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<svg s:docbase="/home/bersace/Pictures/Scoutisme" s:docname="B ret.svg" s:version="0.32" viewBox="0 0 556.25 256.25" xmlns="http://www.w3.org/2000/svg" xmlns:s="http://www.w3.org/1999/xlink" xmlns:xlink="http://www.w3.org/1999/xlink">
<s:namedview/>
<path d="m435,80.89286c18.20859,3.73299 50.13173,12.65638 67.14286,20.71428 13.57142,6.42857 42.83766,41.17202-5.00001,77.14286-89.8841,67.58704-232.05431,64.80578-412.14285,36.42857-22.83085-3.59754-85.16064-33.91067-75.71429-49.28572 80.89918-131.67281 271.9943-116.51449 425.71429-84.99999z" fill-rule="evenodd" opacity=".25" s:nodetypes="csssss"/>
<path d="m13.4699,166.18218c28.75961,64.7995 429.79079,100.72443 480.68709-36.74779 2.06237-54.19359 2.91595-55.01937-9.4262-70.58465-9.03852-11.39891-24.13396-41.15211-242.82689-53.95408-178.78679,37.78592-211.02246,52.34426-234.76105,97.77919-.81311,27.05037-.67803,42.074 6.32705,63.50732z" fill="#070862" fill-rule="evenodd" s:nodetypes="ccsccc"/>
<path d="m10.19434,112.29846c3.91434.65709 6.38052,26.46846 19.34078,27.06078s19.27174,4.15885 38.85639,4.43406 17.52909,16.16712 41.31658,15.87286c11.89374-.14714 21.33872,7.45759 45.6492,2.10245s40.98649,5.9727 53.77089,5.41466 31.81617-10.79664 51.48218-16.8838 42.46628,8.04854 54.93057,6.9534 19.67291-6.25143 43.64167-15.85925c11.98438-4.80391 30.30165,3.92215 44.84665-1.01347 14.54499-4.93562 25.31771-23.53292 30.78441-24.41212 21.86682-3.5168 15.552-4.80337 32.68503-3.88397 27.31975,1.46605 11.80078-48.36064 14.8324-58.55582 4.9895,17.6129 12.93865,59.66177 11.8977,74.70034-49.74618,130.39693-425.39444,107.1176-477.41154,40.0202-4.61312-17.9437-8.62367-14.94269-6.62291-55.95032zm222.30789-68.26659c27.39052-1.88763 12.00196-35.24144 2.02031-29.03214 3.91434-3.28539 9.12727-3.3225 13.42041-.43134 1.26269,6.30798 5.47816,24.84383 3.58412,30.23188-5.42957.13142-14.22662.28293-19.02484-.7684z" fill-rule="evenodd" opacity=".25" s:nodetypes="csssssssssssccccccccc"/>
<path d="m233.47867,18.74973c-.46561-3.08113 3.68712-6.56396 13.06927-4.73874-15.26558,5.70995-19.4256,34.47406-1.17011,31.44565-.44475,2.10605-10.47125.86606-13.91421-.82491-2.0126-3.14077 1.38703-14.58719 2.01505-25.88201zm5.1118-12.49736c-101.50445,19.59281-206.83592,38.60533-230.15199,97.10613-2.0203,15.90441 3.74078,58.01244 12.51285,63.86709-38.25658-130.05695 86.15193-114.47802 217.63914-160.97322z" fill="#fff" fill-rule="evenodd" opacity=".5" s:nodetypes="ccccccccc"/>
<g transform="translate(-22.9797,-245.7773)">
<g id="g38115" transform="matrix(.70393,.16575,-.16671,.70801,133.288,-21.83065)">
<path d="m477.30206,493.23765-6.75178,17.82766-15.57729,6.28712 13.71989,10.36034-.53699,17.39822 16.39784-11.75435 18.00625,4.30619-5.84229-15.48704 9.3433-16.80939-17.55949,1.71595-11.19944-13.8447z" fill="#b9b9b9" fill-rule="evenodd" s:nodetypes="ccccccccccc"/>
<path d="m481.12834,519.01287-10.12694-7.58004 1.87984-5.33635 8.2471,12.91639zm-3.69907-24.4987-2.66818,6.36723 6.42789,17.82826-3.75971-24.19549zm3.82034,4.60866 4.72995,5.51827-4.3661,13.88664-.36385-19.40491zm6.97364,8.36837-6.3066,11.09717 12.00679-11.58229-5.70019.48512zm10.67269-.84897-16.61544,12.30999 22.92205-13.03767-6.30661.72768zm3.65778,4.49929c-.24256.6064-20.21258,8.17454-20.21258,8.17454l16.49081-1.73997 3.72177-6.43457zm-6.3066,10.98669-13.48151-2.69087 15.45064,7.94464-1.96913-5.25377zm-13.7847-2.38767 16.91865,11.21847 2.54689,6.12467-19.46554-17.34314zm-.12128.36385 13.60612,15.58792-6.08672-1.64065-7.5194-13.94727zm-.18192.24255 2.18305,12.37063-5.82147,4.30548 3.63842-16.67611zm-.1516.18193c-1.09152,1.33409-7.671,19.52619-7.671,19.52619l-5.63955,4.00226 13.31055-23.52845zm-.75801.06064-12.44392,17.8863 .25521-6.304 12.18871-11.5823zm-.27288-.39416-11.89647,7.0806-5.09412-3.77444 16.99059-3.30616zm-.15159-.48512-24.74127-2.3953 4.67793,3.65009 20.06334-1.25479zm.15159-.33353-19.5498-4.08224 5.39029-2.06178 14.15951,6.14402z" fill="#fff" fill-rule="evenodd" s:nodetypes="cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc"/>
</g>
<use height="1052.3622" transform="matrix(.90926,-.41622,.41622,.90926,-297.7042,208.1211)" width="744.09448" xlink:href="#g38115"/>
</g>
<g transform="translate(5.25997,47.6715)">
<path d="m241.04605,107.8314c24.25866,4.26598 51.93743-1.59371 77.85714-1.42857 1.46124,19.59676 7.23509,45.97447.71429,64.28572-21.57443,4.78399-50.71875,3.24306-74.28571,3.57143 .87109-21.94635-.9516-44.82805-4.28572-66.42858z" fill="#00016c" fill-rule="evenodd" s:nodetypes="ccccc"/>
<path d="m281.04604,112.1171c34.82717-2.22788 46.44207,56.16844 5,58.57143-41.68623,2.41714-39.7009-56.34355-5-58.57143z" fill="#023899" fill-rule="evenodd" s:nodetypes="czz" stroke="#a11616" stroke-width="4"/>
<path d="m272.75904,126.82348c3.06049-4.46123 9.33046-9.65252 19.9163-1.41589 1.687-1.27382 6.87603-2.21719 9.14887-2.72227-.0276,3.47152-1.47516,6.99142-3.41975,10.08666 .25439,1.12734 4.07589,3.57654 4.14847,4.86439-.81914,1.54313-1.17594,1.43689-2.06881,4.91567 .93909-.01085 1.23508.18625 1.90532.19752 .56885.31192-.47249,5.23386-.15984,4.9698 .3715-.23463-2.20399.78806-2.50307.92392-.04069.0317 1.60307.67562 1.87848.96484 .13959.44222-1.39805,4.98134-1.67761,5.10936 .25668.20685-1.05701-.56466-2.55979-.02405 .77297.23524 1.82785,1.17471 1.85848,1.47926 .00908,1.1814-2.3771,3.46833-3.75885,6.00804 0,0-.06444-1.24936-.91143-1.89153-.00774,1.57867-7.21994,8.05356-8.9608,7.95412-1.46678-.08378-9.97361-4.29877-10.75465-6.78522-.34599.84273-.52289.6486-.36828,1.75051-.38583-.24304-3.80832-2.6843-4.06419-3.58841-.36033.1907.73437-2.39947.52619-3.12779-.87638,1.70195-1.59672,2.05233-1.95273,1.90746-.83333-1.39881-1.44397-2.60192-2.81302-4.44716 .55779-1.19047 1.41083-.73572 1.58037-2.37262-1.01353.37289-1.67741.61717-2.4348.77813-.80357-1.0119-.44542-5.14337-.78227-7.51222 .63518-.84515 1.70798.57015 2.42862-.50172-1.89559-1.97631-1.72953-3.09504-2.3499-4.5929 .37686-.44924 3.27154-1.89462 3.19102-3.01856-2.38095-3.33333-4.21857-8.09785-4.75428-12.02642 2.38095.20833 6.7824-.85703 9.71195,2.11708zm16.35073,14.66785c.91371.21086 5.33127-1.88173 5.77556-2.91841 .27-.62998-2.38715-2.0617-3.4433-1.45024-1.20576.69807-2.96617,4.22236-2.33226,4.36865zm-15.80094-1.74423c.42262.91568 5.33381,2.49133 6.32211,2.09953 .91802-.36394-2.30058-3.54426-3.46869-4.01151-1.21284-.48514-3.14509,1.28004-2.85342,1.91198zm7.95197,18.66963c0,0 1.82828,2.06789 2.3965.80519 .32047-.71214-2.27023-.86832-2.3965-.80519zm8.64682-.39436c0,0-.95825,1.92926-1.45209,1.07328-.94703-1.64149 1.38896-1.13642 1.45209-1.07328z" fill="#a11616" fill-rule="evenodd" s:nodetypes="ccccccccccccscccccccsccccssccssccsccsc"/>
</g>
</svg>
Update
Here's a solution that works okay, but probably not the best way to go:
# Variant that neutralises every foreign namespace before parsing, removes
# foreign and invisible elements, and strips the leftover declarations from
# the serialized output.
# NOTE: `etree` and `path` come from the first script above.
import re

SVG_TAG_PREFIX = '{http://www.w3.org/2000/svg}'
XLINK_DECL = b'xmlns:xlink="http://www.w3.org/1999/xlink"'
XLINK_ATTR_PREFIX = '{http://www.w3.org/1999/xlink}'
# property -> value that makes an element invisible
HIDDEN_VALUES = {'display': 'none', 'visibility': 'hidden', 'opacity': '0'}

# Any prefixed namespace declaration except xlink (kept) and xml (reserved,
# must not be rebound). Group 1 is the leading whitespace, group 2 the prefix.
namespace_pattern = re.compile(
    rb'(\s)xmlns:(?!(?:xlink|xml)=)([\w.-]+)="[^"]*"'
)
doctype_pattern = re.compile(rb'<!DOCTYPE[^>\[]*(\[[^\]]*\])?>', re.S | re.I)
xlink_pattern = re.compile(rb'xmlns:xlink\s*=\s*(?:"[^"]*"|\'[^\']*\')')


def _is_hidden(el):
    """Return True if *el* is hidden via its inline style or attributes.

    The inline ``style`` is parsed into individual declarations so that
    properties such as ``fill-opacity:0`` are not mistaken for ``opacity:0``.
    """
    style = {}
    for declaration in el.get('style', '').split(';'):
        name, _, value = declaration.partition(':')
        style[name.strip()] = value.strip()
    return any(
        style.get(prop) == value or el.get(prop) == value
        for prop, value in HIDDEN_VALUES.items()
    )


# Work on bytes throughout: lxml rejects ``str`` input with an encoding
# declaration, and etree.tostring() below returns bytes as well.
with open(path, 'rb') as f:
    svg_xml = f.read()

# The reserved xml prefix may not be (re)declared with another URL; drop it.
svg_xml = svg_xml.replace(
    b' xmlns:xml="http://www.w3.org/XML/1998/namespace"', b''
)
# Point every foreign prefix at a harmless dummy URL so that broken
# namespace URLs cannot keep the document from parsing.
svg_xml = namespace_pattern.sub(rb'\1xmlns:\2="http://ns\2"', svg_xml)
# Make sure the xmlns:xlink URL is correct (first declaration only).
svg_xml = xlink_pattern.sub(XLINK_DECL, svg_xml, count=1)
# Remove the undefined Illustrator tag and any DOCTYPE.
svg_xml = svg_xml.replace(b'<i:pgf></i:pgf>', b'')
svg_xml = doctype_pattern.sub(b'', svg_xml)

tree = etree.fromstring(svg_xml)
# Walk a snapshot of the nodes: elements are removed inside the loop, and
# mutating the tree while it is being traversed is fragile.
for el in list(tree.iter()):
    # Comments and processing instructions have a non-string tag; keep them.
    if not isinstance(el.tag, str):
        continue
    # Strip namespaced attributes except xlink ones (snapshot of the names,
    # since attributes are deleted while looping).
    for name in list(el.attrib):
        if name.startswith('{') and not name.startswith(XLINK_ATTR_PREFIX):
            del el.attrib[name]
    parent = el.getparent()
    if parent is None:
        # The root element has no parent and cannot be removed.
        continue
    # A single combined test, so an element that is both foreign and hidden
    # is removed only once.
    if not el.tag.startswith(SVG_TAG_PREFIX) or _is_hidden(el):
        parent.remove(el)

# Serialize, then strip the now-unused dummy namespace declarations.
output = etree.tostring(tree, xml_declaration=True, encoding='utf-8')
with open(path + '_out.svg', 'wb') as f:
    f.write(namespace_pattern.sub(b'', output))