Coverage for portality/lib/normalise.py: 88%
26 statements
coverage.py v6.4.2, created at 2022-07-20 16:12 +0100
from urllib.parse import urlparse, urlunparse, ParseResult

from portality import regex


def normalise_url(url):
    """
    Take a URL and turn it into a form which is suitable for normalised comparison with other normalised
    URLs.

    The function does the following:
    * strips leading/trailing whitespace
    * validates the URL is realistic
    * strips the scheme (so, removes http, https, ftp, ftps, etc)

    :param url:
    :return:
    """
    if url is None:
        return url

    schemes = ["http", "https", "ftp", "ftps"]

    url = url.strip()
    if url.startswith("//"):
        url = "http:" + url

    if "://" not in url:
        url = "http://" + url

    u = urlparse(url)

    if u.netloc is None or u.netloc == "":
        raise ValueError("Could not extract a normalised URL from '{x}'".format(x=url))

    if u.scheme not in schemes:
34 raise ValueError("URL must be at http(s) or ftp(s), found '{x}'".format(x=u.netloc))
    n = ParseResult(netloc=u.netloc, path=u.path, params=u.params, query=u.query, fragment=u.fragment, scheme='')
    return urlunparse(n)
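
# Usage sketch (not part of the original module; the example URLs are hypothetical):
# because the scheme is stripped, http and https variants of the same address should
# normalise to the same string, which is what makes the output suitable for comparison.
#
#   normalise_url("http://www.example.com/article")    # -> "//www.example.com/article"
#   normalise_url("https://www.example.com/article")   # -> "//www.example.com/article"
#   normalise_url("www.example.com/article")           # -> "//www.example.com/article"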


def normalise_doi(doi):
    """
    Take a DOI and turn it into a form which is suitable for normalised comparison with other normalised
    DOIs.

    The function does the following:
    * strips leading/trailing whitespace
    * validates that the DOI meets the regex
    * extracts only the 10.xxxx portion

    :param doi:
    :return:
    """
    if doi is None:
        return None
    doi = doi.strip()
    norm = regex.group_match(regex.DOI_COMPILED, doi, "id")
    if norm is None:
        raise ValueError("Could not extract a normalised DOI from '{x}'".format(x=doi))
    return norm
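
# Usage sketch (not part of the original module; the example DOI is hypothetical and the
# exact output depends on the "id" group defined by regex.DOI_COMPILED): the intent is
# that different ways of writing the same DOI reduce to the bare 10.xxxx identifier.
#
#   normalise_doi("https://doi.org/10.1234/example.5678")   # -> "10.1234/example.5678" (assumed)
#   normalise_doi("  10.1234/example.5678  ")               # -> "10.1234/example.5678" (assumed)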