From e2ad39ca08484e78e608dd63a2ddbca662b15f85 Mon Sep 17 00:00:00 2001 From: Bill Flynn Date: Fri, 5 Jan 2018 19:22:00 -0500 Subject: [PATCH 1/4] [Fix #95] robust handling of DOIs Added DOI regex function to utils.py which is called in add_cmd.py upon specifying a new DOI. DOI validation applies directly on argument parsing by using a custom argparse.Action. --- pubs/commands/add_cmd.py | 11 +++++- pubs/utils.py | 35 ++++++++++++++++++ tests/test_doi.py | 80 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 125 insertions(+), 1 deletion(-) create mode 100644 tests/test_doi.py diff --git a/pubs/commands/add_cmd.py b/pubs/commands/add_cmd.py index 8c5db77..f17a662 100644 --- a/pubs/commands/add_cmd.py +++ b/pubs/commands/add_cmd.py @@ -1,3 +1,4 @@ +import argparse from ..uis import get_ui from .. import bibstruct from .. import content @@ -7,13 +8,21 @@ from .. import templates from .. import apis from .. import color from .. import pretty +from .. import utils +class ValidateDOI(argparse.Action): + def __call__(self, parser, namespace, values, option_string=None): + doi = values + new_doi = utils.standardize_doi(doi) + if not new_doi: + raise ValueError("Not a valid doi: %s", doi) + setattr(namespace, self.dest, new_doi) def parser(subparsers, conf): parser = subparsers.add_parser('add', help='add a paper to the repository') parser.add_argument('bibfile', nargs='?', default=None, help='bibtex file') - parser.add_argument('-D', '--doi', help='doi number to retrieve the bibtex entry, if it is not provided', default=None) + parser.add_argument('-D', '--doi', help='doi number to retrieve the bibtex entry, if it is not provided', default=None, action=ValidateDOI) parser.add_argument('-I', '--isbn', help='isbn number to retrieve the bibtex entry, if it is not provided', default=None) parser.add_argument('-d', '--docfile', help='pdf or ps file', default=None) parser.add_argument('-t', '--tags', help='tags associated to the paper, separated by commas', diff --git a/pubs/utils.py b/pubs/utils.py index 2af0fa2..bbfeab5 100644 --- a/pubs/utils.py +++ b/pubs/utils.py @@ -1,5 +1,7 @@ # Function here may belong somewhere else. In the mean time... +import re + from . import color from . import pretty @@ -44,3 +46,36 @@ def resolve_citekey_list(repo, citekeys, ui=None, exit_on_fail=True): ui.exit() else: return keys + +def standardize_doi(doi): + """ + Given a putative doi, attempts to always return it in the form of + 10.XXXX/... Specifically designed to handle these cases: + - https://doi.org/ + - http://doi.org/ + - https://dx.doi.org/ + - http://dx.doi.org/ + - dx.doi.org/ + - doi.org/ + and attempts to verify doi adherence to DOI handbook standards and + crossref.org advice: + https://www.doi.org/doi_handbook/2_Numbering.html + https://www.crossref.org/blog/dois-and-matching-regular-expressions/""" + """ :returns standardized doi """ + doi_regexes = ( + re.compile(r'(10\.\d{4,9}/[-._;()/:A-z0-9\>\<]+)'), + re.compile(r'(10.1002/[^\s]+)'), + re.compile(r'(10\.\d{4}/\d+-\d+X?(\d+)\d+<[\d\w]+:[\d\w]*>\d+.\d+.\w+;\d)'), + re.compile(r'(10\.1021/\w\w\d+\+)'), + re.compile(r'(10\.1207/[\w\d]+\&\d+_\d+)') + ) + + for doi_regex in doi_regexes: + match = doi_regex.search(doi) + if match: + new_doi = match.group(0) + break + else: + new_doi = None + + return new_doi diff --git a/tests/test_doi.py b/tests/test_doi.py new file mode 100644 index 0000000..4b345a6 --- /dev/null +++ b/tests/test_doi.py @@ -0,0 +1,80 @@ +# coding: utf8 + +from __future__ import unicode_literals +import unittest + +from pubs.p3 import ustr +from pubs.utils import standardize_doi + +class TestDOIStandardization(unittest.TestCase): + + def setUp(self): + # some of these come from + # https://stackoverflow.com/questions/27910/finding-a-doi-in-a-document-or-page + self.crossref_dois = ( + '10.2310/JIM.0b013e31820bab4c', + '10.1007/978-3-642-28108-2_19', + '10.1016/S0735-1097(98)00347-7', + ) + + self.hard_dois = ( + '10.1175/1520-0485(2002)032<0870:CT>2.0.CO;2', + '10.1002/(SICI)1522-2594(199911)42:5<952::AID-MRM16>3.0.CO;2-S', + '10.1579/0044-7447(2006)35\[89:RDUICP\]2.0.CO;2', + ) + + self.currently_not_supported = ( + '10.1007.10/978-3-642-28108-2_19', + '10.1000.10/123456', + '10.1016.12.31/nature.S0735-1097(98)2000/12/31/34:7-7', + ) + + def test_http_dxdoi_org(self): + doi = 'http://dx.doi.org/10.1109/5.771073' + sdoi = standardize_doi(doi) + self.assertEqual(sdoi, '10.1109/5.771073') + + def test_https_dxdoi_org(self): + doi = 'https://dx.doi.org/10.1109/5.771073' + sdoi = standardize_doi(doi) + self.assertEqual(sdoi, '10.1109/5.771073') + + def test_http_doi_org(self): + doi = 'http://doi.org/10.1109/5.771073' + sdoi = standardize_doi(doi) + self.assertEqual(sdoi, '10.1109/5.771073') + + def test_https_doi_org(self): + doi = 'https://doi.org/10.1109/5.771073' + sdoi = standardize_doi(doi) + self.assertEqual(sdoi, '10.1109/5.771073') + + def test_doi_org(self): + doi = 'doi.org/10.1109/5.771073' + sdoi = standardize_doi(doi) + self.assertEqual(sdoi, '10.1109/5.771073') + + def test_dxdoi_org(self): + doi = 'dx.doi.org/10.1109/5.771073' + sdoi = standardize_doi(doi) + self.assertEqual(sdoi, '10.1109/5.771073') + + def test_doi_colon_org(self): + doi = 'doi:10.1109/5.771073' + sdoi = standardize_doi(doi) + self.assertEqual(sdoi, '10.1109/5.771073') + + def test_crossref_dois(self): + for doi in self.crossref_dois: + sdoi = standardize_doi(doi) + self.assertEqual(doi, sdoi) + + def test_hard_dois(self): + for doi in self.hard_dois: + sdoi = standardize_doi(doi) + self.assertEqual(doi, sdoi) + + def test_currently_not_supported(self): + for doi in self.currently_not_supported: + sdoi = standardize_doi(doi) + self.assertIs(sdoi, None) From aea58dea2929beae5f7e11ee0e7a94789f15849e Mon Sep 17 00:00:00 2001 From: Bill Flynn Date: Fri, 5 Jan 2018 21:31:19 -0500 Subject: [PATCH 2/4] [#95] refactored exception into standardize_doi --- pubs/commands/add_cmd.py | 8 ++++---- pubs/utils.py | 35 ++++++++++++++++++----------------- 2 files changed, 22 insertions(+), 21 deletions(-) diff --git a/pubs/commands/add_cmd.py b/pubs/commands/add_cmd.py index f17a662..a425434 100644 --- a/pubs/commands/add_cmd.py +++ b/pubs/commands/add_cmd.py @@ -10,14 +10,14 @@ from .. import color from .. import pretty from .. import utils + class ValidateDOI(argparse.Action): def __call__(self, parser, namespace, values, option_string=None): doi = values new_doi = utils.standardize_doi(doi) - if not new_doi: - raise ValueError("Not a valid doi: %s", doi) setattr(namespace, self.dest, new_doi) + def parser(subparsers, conf): parser = subparsers.add_parser('add', help='add a paper to the repository') parser.add_argument('bibfile', nargs='?', default=None, @@ -30,9 +30,9 @@ def parser(subparsers, conf): parser.add_argument('-k', '--citekey', help='citekey associated with the paper;\nif not provided, one will be generated automatically.', default=None) parser.add_argument('-L', '--link', action='store_false', dest='copy', default=True, - help="don't copy document files, just create a link.") + help="don't copy document files, just create a link.") parser.add_argument('-M', '--move', action='store_true', dest='move', default=False, - help="move document instead of of copying (ignored if --link).") + help="move document instead of of copying (ignored if --link).") return parser diff --git a/pubs/utils.py b/pubs/utils.py index bbfeab5..60aadac 100644 --- a/pubs/utils.py +++ b/pubs/utils.py @@ -1,10 +1,10 @@ # Function here may belong somewhere else. In the mean time... - import re from . import color from . import pretty + def resolve_citekey(repo, citekey, ui=None, exit_on_fail=True): """Check that a citekey exists, or autocompletes it if not ambiguous.""" """ :returns found citekey """ @@ -47,6 +47,7 @@ def resolve_citekey_list(repo, citekeys, ui=None, exit_on_fail=True): else: return keys + def standardize_doi(doi): """ Given a putative doi, attempts to always return it in the form of @@ -60,22 +61,22 @@ def standardize_doi(doi): and attempts to verify doi adherence to DOI handbook standards and crossref.org advice: https://www.doi.org/doi_handbook/2_Numbering.html - https://www.crossref.org/blog/dois-and-matching-regular-expressions/""" - """ :returns standardized doi """ - doi_regexes = ( - re.compile(r'(10\.\d{4,9}/[-._;()/:A-z0-9\>\<]+)'), - re.compile(r'(10.1002/[^\s]+)'), - re.compile(r'(10\.\d{4}/\d+-\d+X?(\d+)\d+<[\d\w]+:[\d\w]*>\d+.\d+.\w+;\d)'), - re.compile(r'(10\.1021/\w\w\d+\+)'), - re.compile(r'(10\.1207/[\w\d]+\&\d+_\d+)') - ) + https://www.crossref.org/blog/dois-and-matching-regular-expressions/ - for doi_regex in doi_regexes: - match = doi_regex.search(doi) - if match: - new_doi = match.group(0) - break - else: - new_doi = None + :returns standardized doi + """ + + doi_regexes = ( + '(10\.\d{4,9}/[-._;()/:A-z0-9\>\<]+)', + '(10.1002/[^\s]+)', + '(10\.\d{4}/\d+-\d+X?(\d+)\d+<[\d\w]+:[\d\w]*>\d+.\d+.\w+;\d)', + '(10\.1021/\w\w\d+\+)', + '(10\.1207/[\w\d]+\&\d+_\d+)') + doi_pattern = re.compile('|'.join(doi_regexes)) + + match = doi_pattern.search(doi) + if not match: + raise ValueError("Not a valid doi: %s", doi) + new_doi = match.group(0) return new_doi From aa408e9b2dbb90c5dcb9938b70db7b169c9a5c7c Mon Sep 17 00:00:00 2001 From: Bill Flynn Date: Fri, 5 Jan 2018 21:31:46 -0500 Subject: [PATCH 3/4] [#95] updated doi tests --- tests/test_doi.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_doi.py b/tests/test_doi.py index 4b345a6..e694973 100644 --- a/tests/test_doi.py +++ b/tests/test_doi.py @@ -3,9 +3,9 @@ from __future__ import unicode_literals import unittest -from pubs.p3 import ustr from pubs.utils import standardize_doi + class TestDOIStandardization(unittest.TestCase): def setUp(self): @@ -59,7 +59,7 @@ class TestDOIStandardization(unittest.TestCase): sdoi = standardize_doi(doi) self.assertEqual(sdoi, '10.1109/5.771073') - def test_doi_colon_org(self): + def test_doi_colon(self): doi = 'doi:10.1109/5.771073' sdoi = standardize_doi(doi) self.assertEqual(sdoi, '10.1109/5.771073') @@ -76,5 +76,5 @@ class TestDOIStandardization(unittest.TestCase): def test_currently_not_supported(self): for doi in self.currently_not_supported: - sdoi = standardize_doi(doi) - self.assertIs(sdoi, None) + with self.assertRaises(ValueError): + standardize_doi(doi) From 098cb4d1bf4068d46af2fa4aa8e316bd2e11c379 Mon Sep 17 00:00:00 2001 From: Bill Flynn Date: Fri, 5 Jan 2018 21:38:47 -0500 Subject: [PATCH 4/4] ran pep8 on pubs/utils --- pubs/utils.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/pubs/utils.py b/pubs/utils.py index 60aadac..ba35da5 100644 --- a/pubs/utils.py +++ b/pubs/utils.py @@ -6,25 +6,29 @@ from . import pretty def resolve_citekey(repo, citekey, ui=None, exit_on_fail=True): - """Check that a citekey exists, or autocompletes it if not ambiguous.""" - """ :returns found citekey """ + """Check that a citekey exists, or autocompletes it if not ambiguous. + :returns found citekey + """ # FIXME. Make me optionally non ui interactive/exiting citekeys = repo.citekeys_from_prefix(citekey) if len(citekeys) == 0: if ui is not None: - ui.error("No citekey named or beginning with '{}'".format(color.dye_out(citekey, 'citekey'))) + ui.error("No citekey named or beginning with '{}'".format( + color.dye_out(citekey, 'citekey'))) if exit_on_fail: ui.exit() elif len(citekeys) == 1: if citekeys[0] != citekey: if ui is not None: - ui.info("'{}' has been autocompleted into '{}'.".format(color.dye_out(citekey, 'citekey'), color.dye_out(citekeys[0], 'citekey'))) + ui.info("'{}' has been autocompleted into '{}'.".format( + color.dye_out(citekey, 'citekey'), + color.dye_out(citekeys[0], 'citekey'))) citekey = citekeys[0] elif citekey not in citekeys: if ui is not None: citekeys = sorted(citekeys) - ui.error("Be more specific; '{}' matches multiples citekeys:".format( - citekey)) + ui.error("Be more specific; '{}' matches multiples " + "citekeys:".format(citekey)) for c in citekeys: p = repo.pull_paper(c) ui.message(u' {}'.format(pretty.paper_oneliner(p)))