Coverage for portality/lib/normalise.py: 88%
26 statements
coverage.py v6.4.2, created at 2022-07-20 16:12 +0100
from urllib.parse import urlparse, urlunparse, ParseResult

from portality import regex


def normalise_url(url):
    """
    Take a URL and turn it into a form which is suitable for normalised comparison with other normalised
    URLs.

    The function does the following:
    * strips leading/trailing whitespace
    * validates the URL is realistic
    * strips the scheme (so, removes http, https, ftp, ftps, etc)

    :param url:
    :return:
    """
    if url is None:
        return url

    schemes = ["http", "https", "ftp", "ftps"]

    url = url.strip()
    if url.startswith("//"):
        url = "http:" + url

    if "://" not in url:
        url = "http://" + url

    u = urlparse(url)

    if u.netloc is None or u.netloc == "":
        raise ValueError("Could not extract a normalised URL from '{x}'".format(x=url))

    if u.scheme not in schemes:
34 raise ValueError("URL must be at http(s) or ftp(s), found '{x}'".format(x=u.netloc))
    n = ParseResult(netloc=u.netloc, path=u.path, params=u.params, query=u.query, fragment=u.fragment, scheme='')
    return urlunparse(n)
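
# Usage sketch (not part of the original module; the example URLs are hypothetical):
# because the scheme is stripped, http and https variants of the same address should
# normalise to the same string, which is what makes the output suitable for comparison.
#
#   normalise_url("http://www.example.com/article")    # -> "//www.example.com/article"
#   normalise_url("https://www.example.com/article")   # -> "//www.example.com/article"
#   normalise_url("www.example.com/article")           # -> "//www.example.com/article"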


def normalise_doi(doi):
    """
    Take a DOI and turn it into a form which is suitable for normalised comparison with other normalised
    DOIs.

    The function does the following:
    * strips leading/trailing whitespace
    * validates that the DOI meets the regex
    * extracts only the 10.xxxx portion

    :param doi:
    :return:
    """
    if doi is None:
        return None
    doi = doi.strip()
    norm = regex.group_match(regex.DOI_COMPILED, doi, "id")
    if norm is None:
        raise ValueError("Could not extract a normalised DOI from '{x}'".format(x=doi))
    return norm
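
# Usage sketch (not part of the original module; the example DOI is hypothetical and the
# exact output depends on the "id" group defined by regex.DOI_COMPILED): the intent is
# that different ways of writing the same DOI reduce to the bare 10.xxxx identifier.
#
#   normalise_doi("https://doi.org/10.1234/example.5678")   # -> "10.1234/example.5678" (assumed)
#   normalise_doi("  10.1234/example.5678  ")               # -> "10.1234/example.5678" (assumed)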