Coverage for portality / lib / normalise.py: 96%
27 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-05-05 00:09 +0100
« prev ^ index » next coverage.py v7.13.5, created at 2026-05-05 00:09 +0100
1from urllib.parse import urlparse, urlunparse, ParseResult
2from portality import regex
def normalise_url(url):
    """
    Take a URL and turn it into a form which is suitable for normalised comparison with other
    normalised URLs.

    The function does the following:
    * returns None unchanged
    * strips leading/trailing whitespace
    * assumes http when no scheme is supplied (inputs starting with "//", or with no "://" at all)
    * validates that a network location can be extracted, and that the scheme is one of
      http, https, ftp, ftps
    * strips the scheme, so the result is a scheme-relative URL ("//host/path?query#fragment")

    :param url: the URL string to normalise, or None
    :return: the URL with its scheme removed, or None if url was None
    :raises ValueError: if no network location can be extracted, or the scheme is not allowed
    """
    if url is None:
        return url

    schemes = ("http", "https", "ftp", "ftps")
    url = url.strip()

    # Give scheme-less input a default scheme so urlparse can identify the network location
    if url.startswith("//"):
        url = "http:" + url
    if "://" not in url:
        url = "http://" + url

    u = urlparse(url)

    if not u.netloc:
        raise ValueError("Could not extract a normalised URL from '{x}'".format(x=url))

    if u.scheme not in schemes:
        # Report the offending scheme (not the host), since the scheme is what was rejected
        raise ValueError("URL must be at http(s) or ftp(s), found '{x}'".format(x=u.scheme))

    # Rebuild the URL with an empty scheme; every other component is carried over unchanged
    n = ParseResult(netloc=u.netloc, path=u.path, params=u.params, query=u.query, fragment=u.fragment, scheme='')
    return urlunparse(n)
def normalise_doi(doi):
    """
    Reduce a DOI to a canonical form that can be compared against other normalised DOIs.

    Steps performed:
    * None is passed straight back
    * surrounding whitespace is removed
    * the "id" group of regex.DOI_COMPILED is pulled out via regex.group_match
      (the original documentation describes this as the 10.xxxx portion)
    * the extracted value is lower-cased. This is not part of the DOI standard, but is
      common practice

    :param doi: the DOI string to normalise, or None
    :return: the lower-cased extracted DOI, or None if doi was None
    :raises ValueError: if regex.group_match finds no "id" group in the supplied value
    """
    if doi is None:
        return None

    candidate = doi.strip()
    extracted = regex.group_match(regex.DOI_COMPILED, candidate, "id")
    if extracted is not None:
        return extracted.lower()

    raise ValueError("Could not extract a normalised DOI from '{x}'".format(x=candidate))