Coverage for portality / lib / normalise.py: 96%

27 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-05-04 09:41 +0100

1from urllib.parse import urlparse, urlunparse, ParseResult 

2from portality import regex 

3 

def normalise_url(url):
    """
    Take a URL and turn it into a form which is suitable for normalised comparison with other
    normalised URLs.

    The function does the following:
    * strips leading/trailing whitespace
    * assumes http for protocol-relative ("//host/path") and scheme-less ("host/path") input
    * validates that the URL has a network location and an http(s)/ftp(s) scheme
    * strips the scheme, so the result has the form "//host/path?query#fragment"

    :param url: the URL string to normalise, or None
    :return: the scheme-less URL string, or None if ``url`` was None
    :raises ValueError: if no network location can be extracted from the URL, or if its scheme
        is not one of http, https, ftp, ftps
    """
    if url is None:
        return url

    schemes = ("http", "https", "ftp", "ftps")

    url = url.strip()

    # Give urlparse something it can split into scheme + netloc: protocol-relative and
    # bare "host/path" inputs are both treated as http.
    if url.startswith("//"):
        url = "http:" + url

    if "://" not in url:
        url = "http://" + url

    u = urlparse(url)

    if not u.netloc:
        raise ValueError("Could not extract a normalised URL from '{x}'".format(x=url))

    if u.scheme not in schemes:
        # Report the rejected scheme (not the host) so the message matches what was checked.
        raise ValueError("URL must be at http(s) or ftp(s), found '{x}'".format(x=u.scheme))

    # Rebuild the URL with an empty scheme; everything else is carried over unchanged.
    n = ParseResult(scheme='', netloc=u.netloc, path=u.path, params=u.params, query=u.query,
                    fragment=u.fragment)
    return urlunparse(n)

38 

39 

def normalise_doi(doi):
    """
    Reduce a DOI to a canonical form that can be compared against other normalised DOIs.

    Steps performed:
    * leading/trailing whitespace is removed
    * the "id" group of the project's compiled DOI regex is extracted from the value
      (per the original documentation, this is the 10.xxxx portion)
    * the extracted identifier is lower-cased; this is not mandated by the standard, but is
      common practice

    :param doi: the DOI string to normalise, or None
    :return: the lower-cased DOI identifier, or None if ``doi`` was None
    :raises ValueError: if the regex lookup yields no identifier for the value
    """
    if doi is None:
        return None

    candidate = doi.strip()
    identifier = regex.group_match(regex.DOI_COMPILED, candidate, "id")

    if identifier is not None:
        return identifier.lower()

    raise ValueError("Could not extract a normalised DOI from '{x}'".format(x=candidate))