Skip to content

Commit

Permalink
Improve arXiv fetcher (#6113)
Browse files Browse the repository at this point in the history
No longer include the version string in the `eprint` field, as requested in https://discourse.jabref.org/t/remove-version-in-arxiv-import/1941. Also improved the arXiv identifier parser a bit.

Co-authored-by: Christoph <[email protected]>
  • Loading branch information
tobiasdiez and Siedlerchr authored Mar 13, 2020
1 parent aab7a08 commit 209d336
Show file tree
Hide file tree
Showing 5 changed files with 98 additions and 67 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,10 @@ Note that this project **does not** adhere to [Semantic Versioning](http://semve

### Changed

- We improved the arXiv fetcher. It should now find entries even more reliably and no longer includes the version (e.g., `v1`) in the `eprint` field. [forum#1941](https://discourse.jabref.org/t/remove-version-in-arxiv-import/1941)
- We moved the group search bar and the "New group" button from the bottom to the top to make them more prominent. [#6112](https://github.com/JabRef/jabref/pull/6112)


### Fixed

- We fixed an issue where opening a library from the recent libraries menu was not possible. [#5939](https://github.com/JabRef/jabref/issues/5939)
Expand Down
25 changes: 4 additions & 21 deletions src/main/java/org/jabref/logic/importer/fetcher/ArXiv.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,6 @@
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import javax.xml.parsers.DocumentBuilder;
Expand Down Expand Up @@ -59,8 +57,6 @@ public class ArXiv implements FulltextFetcher, SearchBasedFetcher, IdBasedFetche
private static final Logger LOGGER = LoggerFactory.getLogger(ArXiv.class);

private static final String API_URL = "https://export.arxiv.org/api/query";
private static final String ARXIV_URL_PREFIX_FOR_ID = "(https?://arxiv.org/abs/)";
private static final Pattern URL_PATTERN = Pattern.compile(ARXIV_URL_PREFIX_FOR_ID);

private final ImportFormatPreferences importFormatPreferences;

Expand Down Expand Up @@ -106,7 +102,7 @@ private Optional<ArXivEntry> searchForEntry(String searchQuery) throws FetcherEx

private Optional<ArXivEntry> searchForEntryById(String id) throws FetcherException {
Optional<ArXivIdentifier> identifier = ArXivIdentifier.parse(id);
if (!identifier.isPresent()) {
if (identifier.isEmpty()) {
return Optional.empty();
}

Expand Down Expand Up @@ -263,10 +259,8 @@ public List<BibEntry> performSearch(String query) throws FetcherException {

@Override
public Optional<BibEntry> performSearchById(String identifier) throws FetcherException {
String cleanedIdentifier = identifier.replaceAll(" ", "");
cleanedIdentifier = ArXivEntry.createIdString(cleanedIdentifier);

return searchForEntryById(cleanedIdentifier).map((arXivEntry) -> arXivEntry.toBibEntry(importFormatPreferences.getKeywordSeparator()));
return searchForEntryById(identifier)
.map((arXivEntry) -> arXivEntry.toBibEntry(importFormatPreferences.getKeywordSeparator()));
}

@Override
Expand Down Expand Up @@ -372,18 +366,7 @@ public Optional<URL> getPdfUrl() {
* Returns the arXiv identifier
*/
public Optional<String> getIdString() {
return urlAbstractPage.map(ArXivEntry::createIdString);
}

public static String createIdString(String id) {
Matcher matcher = URL_PATTERN.matcher(id);
if (matcher.find()) {
// Remove leading http(s)://arxiv.org/abs/ from abstract url to get arXiv ID
return id.substring(matcher.group(1).length());
} else {
return id;
}

return urlAbstractPage.flatMap(ArXivIdentifier::parse).map(ArXivIdentifier::getNormalizedWithoutVersion);
}

public Optional<ArXivIdentifier> getId() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,49 +9,59 @@

import org.jabref.model.entry.field.Field;
import org.jabref.model.entry.field.StandardField;
import org.jabref.model.strings.StringUtil;

/**
* Identifier for the arXiv. See https://arxiv.org/help/arxiv_identifier
*/
public class ArXivIdentifier implements Identifier {

private static final String ARXIV_PREFIX = "http(s)?://arxiv.org/(abs|pdf)/|arxiv|arXiv";
private final String identifier;
private final String classification;
private final String version;

ArXivIdentifier(String identifier) {
this(identifier, "");
this(identifier, "", "");
}

ArXivIdentifier(String identifier, String classification) {
this(identifier, "", classification);
}

ArXivIdentifier(String identifier, String version, String classification) {
this.identifier = identifier.trim();
this.version = version.trim();
this.classification = classification.trim();
}

public static Optional<ArXivIdentifier> parse(String value) {
Pattern identifierPattern = Pattern.compile("(arxiv|arXiv)?\\s?:?\\s?(?<id>\\d{4}.\\d{4,5}(v\\d+)?)\\s?(\\[(?<classification>\\S+)\\])?");
Matcher identifierMatcher = identifierPattern.matcher(value);
String identifier = value.replaceAll(" ", "");
Pattern identifierPattern = Pattern.compile("(" + ARXIV_PREFIX + ")?\\s?:?\\s?(?<id>\\d{4}.\\d{4,5})(v(?<version>\\d+))?\\s?(\\[(?<classification>\\S+)\\])?");
Matcher identifierMatcher = identifierPattern.matcher(identifier);
if (identifierMatcher.matches()) {
String id = identifierMatcher.group("id");
String classification = identifierMatcher.group("classification");
if (classification == null) {
classification = "";
}
return Optional.of(new ArXivIdentifier(id, classification));
String version = identifierMatcher.group("version");
if (version == null) {
version = "";
}
return Optional.of(new ArXivIdentifier(id, version, classification));
}

Pattern oldIdentifierPattern = Pattern.compile("(arxiv|arXiv)?\\s?:?\\s?(?<id>(?<classification>[a-z\\-]+(\\.[A-Z]{2})?)/\\d{7})");
Matcher oldIdentifierMatcher = oldIdentifierPattern.matcher(value);
Pattern oldIdentifierPattern = Pattern.compile("(" + ARXIV_PREFIX + ")?\\s?:?\\s?(?<id>(?<classification>[a-z\\-]+(\\.[A-Z]{2})?)/\\d{7})(v(?<version>\\d+))?");
Matcher oldIdentifierMatcher = oldIdentifierPattern.matcher(identifier);
if (oldIdentifierMatcher.matches()) {
String id = oldIdentifierMatcher.group("id");
String classification = oldIdentifierMatcher.group("classification");
return Optional.of(new ArXivIdentifier(id, classification));
}

Pattern urlPattern = Pattern.compile("(http://arxiv.org/abs/)(?<id>\\S+)");
Matcher urlMatcher = urlPattern.matcher(value);
if (urlMatcher.matches()) {
String id = urlMatcher.group("id");
return Optional.of(new ArXivIdentifier(id));
String version = oldIdentifierMatcher.group("version");
if (version == null) {
version = "";
}
return Optional.of(new ArXivIdentifier(id, version, classification));
}

return Optional.empty();
Expand Down Expand Up @@ -99,6 +109,14 @@ public Field getDefaultField() {

/**
 * Returns the normalized form of this identifier, including the version when one is known.
 *
 * @return the identifier followed by {@code "v"} and the version (for example
 *         {@code 1405.2249v1}) if the version is not blank; otherwise the bare identifier
 */
@Override
public String getNormalized() {
    // Only append the version suffix when a version was actually recorded.
    return StringUtil.isNotBlank(version) ? identifier + "v" + version : identifier;
}

/**
 * Returns the identifier without any version suffix.
 *
 * @return the stored identifier, regardless of whether a version is set
 */
public String getNormalizedWithoutVersion() {
    return this.identifier;
}

Expand Down
60 changes: 30 additions & 30 deletions src/test/java/org/jabref/logic/importer/fetcher/ArXivTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,14 @@
import static org.mockito.Mockito.when;

@FetcherTest
public class ArXivTest {
class ArXivTest {

private ArXiv finder;
private BibEntry entry;
private BibEntry sliceTheoremPaper;

@BeforeEach
public void setUp() {
void setUp() {
ImportFormatPreferences importFormatPreferences = mock(ImportFormatPreferences.class);
when(importFormatPreferences.getKeywordSeparator()).thenReturn(',');
finder = new ArXiv(importFormatPreferences);
Expand All @@ -41,107 +41,107 @@ public void setUp() {
sliceTheoremPaper.setField(StandardField.TITLE, "Slice theorem for Fréchet group actions and covariant symplectic field theory");
sliceTheoremPaper.setField(StandardField.DATE, "2014-05-09");
sliceTheoremPaper.setField(StandardField.ABSTRACT, "A general slice theorem for the action of a Fr\\'echet Lie group on a Fr\\'echet manifolds is established. The Nash-Moser theorem provides the fundamental tool to generalize the result of Palais to this infinite-dimensional setting. The presented slice theorem is illustrated by its application to gauge theories: the action of the gauge transformation group admits smooth slices at every point and thus the gauge orbit space is stratified by Fr\\'echet manifolds. Furthermore, a covariant and symplectic formulation of classical field theory is proposed and extensively discussed. At the root of this novel framework is the incorporation of field degrees of freedom F and spacetime M into the product manifold F * M. The induced bigrading of differential forms is used in order to carry over the usual symplectic theory to this new setting. The examples of the Klein-Gordon field and general Yang-Mills theory illustrate that the presented approach conveniently handles the occurring symmetries.");
sliceTheoremPaper.setField(StandardField.EPRINT, "1405.2249v1");
sliceTheoremPaper.setField(StandardField.EPRINT, "1405.2249");
sliceTheoremPaper.setField(StandardField.FILE, ":http\\://arxiv.org/pdf/1405.2249v1:PDF");
sliceTheoremPaper.setField(StandardField.EPRINTTYPE, "arXiv");
sliceTheoremPaper.setField(StandardField.EPRINTCLASS, "math-ph");
sliceTheoremPaper.setField(StandardField.KEYWORDS, "math-ph, math.DG, math.MP, math.SG, 58B99, 58Z05, 58B25, 22E65, 58D19, 53D20, 53D42");
}

@Test
public void findFullTextForEmptyEntryResultsEmptyOptional() throws IOException {
void findFullTextForEmptyEntryResultsEmptyOptional() throws IOException {
assertEquals(Optional.empty(), finder.findFullText(entry));
}

@Test
public void findFullTextRejectsNullParameter() {
void findFullTextRejectsNullParameter() {
assertThrows(NullPointerException.class, () -> finder.findFullText(null));
}

@Test
public void findFullTextByDOI() throws IOException {
void findFullTextByDOI() throws IOException {
entry.setField(StandardField.DOI, "10.1529/biophysj.104.047340");
entry.setField(StandardField.TITLE, "Pause Point Spectra in DNA Constant-Force Unzipping");

assertEquals(Optional.of(new URL("http://arxiv.org/pdf/cond-mat/0406246v1")), finder.findFullText(entry));
}

@Test
public void findFullTextByEprint() throws IOException {
void findFullTextByEprint() throws IOException {
entry.setField(StandardField.EPRINT, "1603.06570");
assertEquals(Optional.of(new URL("http://arxiv.org/pdf/1603.06570v1")), finder.findFullText(entry));
}

@Test
public void findFullTextByEprintWithPrefix() throws IOException {
void findFullTextByEprintWithPrefix() throws IOException {
entry.setField(StandardField.EPRINT, "arXiv:1603.06570");
assertEquals(Optional.of(new URL("http://arxiv.org/pdf/1603.06570v1")), finder.findFullText(entry));
}

@Test
public void findFullTextByEprintWithUnknownDOI() throws IOException {
void findFullTextByEprintWithUnknownDOI() throws IOException {
entry.setField(StandardField.DOI, "10.1529/unknown");
entry.setField(StandardField.EPRINT, "1603.06570");

assertEquals(Optional.of(new URL("http://arxiv.org/pdf/1603.06570v1")), finder.findFullText(entry));
}

@Test
public void findFullTextByTitle() throws IOException {
void findFullTextByTitle() throws IOException {
entry.setField(StandardField.TITLE, "Pause Point Spectra in DNA Constant-Force Unzipping");

assertEquals(Optional.of(new URL("http://arxiv.org/pdf/cond-mat/0406246v1")), finder.findFullText(entry));
}

@Test
public void findFullTextByTitleAndPartOfAuthor() throws IOException {
void findFullTextByTitleAndPartOfAuthor() throws IOException {
entry.setField(StandardField.TITLE, "Pause Point Spectra in DNA Constant-Force Unzipping");
entry.setField(StandardField.AUTHOR, "Weeks and Lucks");

assertEquals(Optional.of(new URL("http://arxiv.org/pdf/cond-mat/0406246v1")), finder.findFullText(entry));
}

@Test
public void notFindFullTextByUnknownDOI() throws IOException {
void notFindFullTextByUnknownDOI() throws IOException {
entry.setField(StandardField.DOI, "10.1529/unknown");
assertEquals(Optional.empty(), finder.findFullText(entry));
}

@Test
public void notFindFullTextByUnknownId() throws IOException {
void notFindFullTextByUnknownId() throws IOException {
entry.setField(StandardField.EPRINT, "1234.12345");
assertEquals(Optional.empty(), finder.findFullText(entry));
}

@Test
public void findFullTextByDOINotAvailableInCatalog() throws IOException {
void findFullTextByDOINotAvailableInCatalog() throws IOException {
entry.setField(StandardField.DOI, "10.1016/0370-2693(77)90015-6");
entry.setField(StandardField.TITLE, "Superspace formulation of supergravity");

assertEquals(Optional.empty(), finder.findFullText(entry));
}

@Test
public void searchEntryByPartOfTitle() throws Exception {
void searchEntryByPartOfTitle() throws Exception {
assertEquals(Collections.singletonList(sliceTheoremPaper),
finder.performSearch("ti:\"slice theorem for Frechet\""));
}

@Test
public void searchEntryByPartOfTitleWithAcuteAccent() throws Exception {
void searchEntryByPartOfTitleWithAcuteAccent() throws Exception {
assertEquals(Collections.singletonList(sliceTheoremPaper),
finder.performSearch("ti:\"slice theorem for Fréchet\""));
}

@Test
public void searchEntryByOldId() throws Exception {
void searchEntryByOldId() throws Exception {
BibEntry expected = new BibEntry();
expected.setType(StandardEntryType.Article);
expected.setField(StandardField.AUTHOR, "H1 Collaboration");
expected.setField(StandardField.TITLE, "Multi-Electron Production at High Transverse Momenta in ep Collisions at HERA");
expected.setField(StandardField.DATE, "2003-07-07");
expected.setField(StandardField.ABSTRACT, "Multi-electron production is studied at high electron transverse momentum in positron- and electron-proton collisions using the H1 detector at HERA. The data correspond to an integrated luminosity of 115 pb-1. Di-electron and tri-electron event yields are measured. Cross sections are derived in a restricted phase space region dominated by photon-photon collisions. In general good agreement is found with the Standard Model predictions. However, for electron pair invariant masses above 100 GeV, three di-electron events and three tri-electron events are observed, compared to Standard Model expectations of 0.30 \\pm 0.04 and 0.23 \\pm 0.04, respectively.");
expected.setField(StandardField.EPRINT, "hep-ex/0307015v1");
expected.setField(StandardField.EPRINT, "hep-ex/0307015");
expected.setField(StandardField.FILE, ":http\\://arxiv.org/pdf/hep-ex/0307015v1:PDF");
expected.setField(StandardField.EPRINTTYPE, "arXiv");
expected.setField(StandardField.EPRINTCLASS, "hep-ex");
Expand All @@ -153,61 +153,61 @@ public void searchEntryByOldId() throws Exception {
}

@Test
public void searchEntryByIdWith4DigitsAndVersion() throws Exception {
void searchEntryByIdWith4DigitsAndVersion() throws Exception {
assertEquals(Optional.of(sliceTheoremPaper), finder.performSearchById("1405.2249v1"));
}

@Test
public void searchEntryByIdWith4Digits() throws Exception {
void searchEntryByIdWith4Digits() throws Exception {
assertEquals(Optional.of(sliceTheoremPaper), finder.performSearchById("1405.2249"));
}

@Test
public void searchEntryByIdWith4DigitsAndPrefix() throws Exception {
void searchEntryByIdWith4DigitsAndPrefix() throws Exception {
assertEquals(Optional.of(sliceTheoremPaper), finder.performSearchById("arXiv:1405.2249"));
}

@Test
public void searchEntryByIdWith4DigitsAndPrefixAndNotTrimmed() throws Exception {
void searchEntryByIdWith4DigitsAndPrefixAndNotTrimmed() throws Exception {
assertEquals(Optional.of(sliceTheoremPaper), finder.performSearchById("arXiv : 1405. 2249"));
}

@Test
public void searchEntryByIdWith5Digits() throws Exception {
void searchEntryByIdWith5Digits() throws Exception {
assertEquals(Optional.of(
"An Optimal Convergence Theorem for Mean Curvature Flow of Arbitrary Codimension in Hyperbolic Spaces"),
finder.performSearchById("1503.06747").flatMap(entry -> entry.getField(StandardField.TITLE)));
}

@Test
public void searchWithMalformedIdThrowsException() throws Exception {
void searchWithMalformedIdThrowsException() throws Exception {
assertThrows(FetcherException.class, () -> finder.performSearchById("123412345"));
}

@Test
public void searchIdentifierForSlicePaper() throws Exception {
void searchIdentifierForSlicePaper() throws Exception {
sliceTheoremPaper.clearField(StandardField.EPRINT);

assertEquals(ArXivIdentifier.parse("1405.2249v1"), finder.findIdentifier(sliceTheoremPaper));
assertEquals(ArXivIdentifier.parse("1405.2249"), finder.findIdentifier(sliceTheoremPaper));
}

@Test
public void searchEmptyId() throws Exception {
void searchEmptyId() throws Exception {
assertEquals(Optional.empty(), finder.performSearchById(""));
}

@Test
public void searchWithHttpUrl() throws Exception {
void searchWithHttpUrl() throws Exception {
assertEquals(Optional.of(sliceTheoremPaper), finder.performSearchById("http://arxiv.org/abs/1405.2249"));
}

@Test
public void searchWithHttpsUrl() throws Exception {
void searchWithHttpsUrl() throws Exception {
assertEquals(Optional.of(sliceTheoremPaper), finder.performSearchById("https://arxiv.org/abs/1405.2249"));
}

@Test
public void searchWithHttpsUrlNotTrimmed() throws Exception {
void searchWithHttpsUrlNotTrimmed() throws Exception {
assertEquals(Optional.of(sliceTheoremPaper), finder.performSearchById("https : // arxiv . org / abs / 1405 . 2249 "));
}
}
Loading

0 comments on commit 209d336

Please sign in to comment.