Coverage for portality/lib/xmlutil.py: 19%

42 statements  

« prev     ^ index     » next       coverage.py v6.4.2, created at 2022-07-22 15:59 +0100

1from lxml import etree 

2import re 

3 

# Matches an XML declaration at the very start of a string and captures the
# value of its ``encoding`` pseudo-attribute (single- or double-quoted).
# Written as a raw string so that ``\?`` reaches the regex engine untouched
# instead of relying on Python passing an invalid string escape through.
encoding_rx = re.compile(r"""^<\?xml .*encoding=["'](.+?)["'].*\?>""")

5 

def detect_encoding(s):
    """Return the encoding named in the XML declaration that opens ``s``.

    The lookup is done with the module-level ``encoding_rx`` pattern, which
    only matches at the start of the string.

    :param s: the XML document text to inspect
    :return: the declared encoding name, or ``None`` when ``s`` does not
        begin with an XML declaration carrying an ``encoding`` value
    """
    declaration = encoding_rx.match(s)
    return None if declaration is None else declaration.group(1)

11 

def fromstring(s):
    """Parse ``s`` into an lxml element, retrying around encoding problems.

    Three attempts are made, in order:

    1. Hand ``s`` to ``etree.fromstring`` unchanged.
    2. If that raised ``ValueError`` and ``s`` is a ``str`` whose XML
       declaration names an encoding, encode ``s`` with that encoding and
       parse the resulting bytes.
    3. Otherwise (or if step 2 failed), remove the XML declaration matched by
       ``encoding_rx``, strip surrounding whitespace and parse what is left.

    :param s: the XML document, as ``str`` or bytes
    :return: whatever ``etree.fromstring`` returns for the first attempt that
        succeeds
    :raises ValueError: from the first attempt when ``s`` is not a ``str``;
        any error from the final attempt is also left to propagate
    """
    try:
        return etree.fromstring(s)
    except ValueError:
        # Only text can be re-encoded or have its declaration stripped; for
        # any other input there is nothing more to try, so let the original
        # error through.
        # NOTE(review): presumably this ValueError is lxml refusing a str that
        # carries an encoding declaration -- confirm against the lxml docs.
        if not isinstance(s, str):
            raise

    declared = detect_encoding(s)
    if declared is not None:
        try:
            return etree.fromstring(s.encode(declared))
        except (LookupError, ValueError):
            # LookupError: the declared encoding is not a codec Python knows.
            # ValueError: encoding or parsing with that codec failed.
            # Either way, fall through to the last-resort attempt below.
            pass

    # Last resort: drop the declaration entirely and let lxml work it out,
    # allowing any error to propagate to the caller.
    without_declaration = encoding_rx.sub("", s).strip()
    return etree.fromstring(without_declaration)

44 

def xp_first_text(element, xpath, default=None):
    """Return the ``.text`` of the first result of ``element.xpath(xpath)``.

    :param element: object exposing an ``xpath(expression)`` method
    :param xpath: the XPath expression to evaluate
    :param default: value returned when the expression yields no results
    :return: the first result's ``text`` attribute (which may itself be
        ``None``), or ``default`` when there are no results
    """
    matches = element.xpath(xpath)
    if len(matches) == 0:
        return default
    first = matches[0]
    return first.text

50 

def xp_texts(element, xpath):
    """Collect the non-``None`` ``.text`` values of every XPath result.

    :param element: object exposing an ``xpath(expression)`` method
    :param xpath: the XPath expression to evaluate
    :return: list of ``text`` values in result order, skipping results whose
        ``text`` is ``None`` (empty strings are kept)
    """
    texts = []
    for node in element.xpath(xpath):
        if node.text is not None:
            texts.append(node.text)
    return texts

54 

def objectify(element):
    """Convert the children of ``element`` into a nested ``dict``.

    Each direct child becomes one entry keyed by the child's ``tag``. A child
    that has children of its own is converted recursively; a leaf child maps
    to its ``text`` (``None`` when it has none).

    Limitations visible in the implementation:

    * the tag and text of ``element`` itself are not included;
    * when several children share a tag, the last one wins;
    * FIXME: attributes are not currently handled.

    Children are walked by iterating the element and sized with ``len()``
    rather than the deprecated ``getchildren()``, so this works with lxml
    elements and with ``xml.etree.ElementTree`` elements alike.

    :param element: the element whose children should be converted
    :return: ``dict`` mapping child tags to text or nested dicts
    """
    obj = {}
    for child in element:
        if len(child) > 0:
            obj[child.tag] = objectify(child)
        else:
            obj[child.tag] = child.text
    return obj