Coverage for portality/lib/normalise.py: 88%

26 statements  

coverage.py v6.4.2, created at 2022-07-20 16:12 +0100

from urllib.parse import urlparse, urlunparse, ParseResult

from portality import regex


def normalise_url(url):
    """
    Take a URL and turn it into a form which is suitable for normalised comparison with other normalised
    URLs.

    The function does the following:
    * strips leading/trailing whitespace
    * validates the URL is realistic
    * strips the scheme (so, removes http, https, ftp, ftps, etc)

    :param url:
    :return:
    """
    if url is None:
        return url

    schemes = ["http", "https", "ftp", "ftps"]
    url = url.strip()
    if url.startswith("//"):
        url = "http:" + url

    if "://" not in url:
        url = "http://" + url

    u = urlparse(url)

    if u.netloc is None or u.netloc == "":
        raise ValueError("Could not extract a normalised URL from '{x}'".format(x=url))

    if u.scheme not in schemes:
        raise ValueError("URL scheme must be http(s) or ftp(s), found '{x}'".format(x=u.scheme))

    n = ParseResult(netloc=u.netloc, path=u.path, params=u.params, query=u.query, fragment=u.fragment, scheme='')
    return urlunparse(n)
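
For illustration, a minimal usage sketch (not part of the module, and not taken from the test suite): two URLs that differ only by scheme reduce to the same scheme-less string and so compare equal after normalisation.

# Sketch only; example URLs are assumptions, not from the DOAJ codebase.
assert normalise_url("http://example.com/article?id=1") == \
       normalise_url("https://example.com/article?id=1")
# Both normalise to "//example.com/article?id=1".
# A bare host is given an http:// prefix before parsing, so it normalises the same way.
assert normalise_url("example.com/article?id=1") == "//example.com/article?id=1"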


def normalise_doi(doi):
    """
    Take a DOI and turn it into a form which is suitable for normalised comparison with other normalised
    DOIs.

    The function does the following:
    * strips leading/trailing whitespace
    * validates that the DOI meets the regex
    * extracts only the 10.xxxx portion

    :param doi:
    :return:
    """
    if doi is None:
        return None
    doi = doi.strip()
    norm = regex.group_match(regex.DOI_COMPILED, doi, "id")
    if norm is None:
        raise ValueError("Could not extract a normalised DOI from '{x}'".format(x=doi))
    return norm
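
For illustration, a hedged usage sketch: assuming regex.DOI_COMPILED captures the 10.xxxx/... identifier in a group named "id" (as the docstring suggests), different spellings of the same DOI would reduce to the same string. The example DOI and expected outputs below are assumptions; the exact matching behaviour depends on regex.DOI_COMPILED, which is defined elsewhere in portality.regex.

# Sketch only -- example values are hypothetical.
normalise_doi("https://doi.org/10.1234/example.5678")   # expected: "10.1234/example.5678"
normalise_doi("doi:10.1234/example.5678")                # expected: "10.1234/example.5678"
normalise_doi("not a doi")                               # expected: raises ValueError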