-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathris.py
79 lines (74 loc) · 2.32 KB
/
ris.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#! /usr/bin/python
# -*- coding: utf-8 -*-
import re
from doi import doi_regex
import commons
def _tag_regex(tags, value=r'(.*)'):
    """Return a compiled regex matching a RIS line that carries one of *tags*.

    *tags* is a regex alternation of tag names (e.g. ``'T1|TI'``) and *value*
    is the sub-pattern for the text after the separator; it must contain
    exactly one capturing group.

    The tag is anchored to the start of a line (leading blanks tolerated) so
    that text such as ``BASIS - ...`` inside another field is not mistaken
    for an ``IS`` tag. The RIS format puts two spaces before the hyphen; one
    or more are accepted here so single-space input keeps working.
    """
    return re.compile(
        r'^[ \t]*(?:' + tags + r') +- ' + value,
        re.MULTILINE,
    )


def _first_value(ris_text, tags, value=r'(.*)'):
    """Return the raw captured value of the first matching line, or None."""
    m = _tag_regex(tags, value).search(ris_text)
    if m is None:
        return None
    return m.group(1)


def parse(ris_text):
    """Parse RIS_text data and return the result as a dictionary.

    Only the first occurrence of each single-valued tag is used. Keys are
    added only for the tags that are found; the possible keys are:
    ``type`` (lower-cased), ``authors`` (list of ``commons.Name``),
    ``title``, ``series``, ``publisher``, ``journal``, ``issue``,
    ``volume``, ``year``, ``month``, ``isbn``, ``doi``, ``startpage``,
    ``endpage``, ``pages``, ``url`` and ``language``.

    Author names rejected by ``commons.Name`` (``InvalidNameError``) are
    skipped, and ``authors`` is omitted when no usable name remains.
    """
    d = {}

    # type: (book, journal, . . . )
    value = _first_value(ris_text, 'TY')
    if value is not None:
        d['type'] = value.strip().lower()

    # authors: AU plus the numbered A1, A2, ... tags
    authors = []
    for raw_name in _tag_regex(r'AU|A\d').findall(ris_text):
        try:
            name = commons.Name(raw_name)
        except commons.InvalidNameError:
            continue
        authors.append(name)
    # d['authors'] should not be created unless there are some authors
    if authors:
        d['authors'] = authors

    value = _first_value(ris_text, 'T1|TI')
    if value:
        d['title'] = value.strip()

    value = _first_value(ris_text, 'T3')
    if value is not None:
        d['series'] = value.strip()

    value = _first_value(ris_text, 'PB')
    if value is not None:
        d['publisher'] = value.strip()

    value = _first_value(ris_text, 'JF|JA')
    if value:
        d['journal'] = value.strip()

    value = _first_value(ris_text, 'IS')
    if value is not None:
        d['issue'] = value.strip()

    value = _first_value(ris_text, 'VL')
    if value is not None:
        d['volume'] = value.strip()

    # The year is the leading run of digits of the first date-like tag.
    value = _first_value(ris_text, 'PY|Y1|DA', r'(\d*)')
    if value:
        d['year'] = value.strip()

    # The month is the digits after the first slash (e.g. "2010/05/...").
    value = _first_value(ris_text, 'PY|Y1|DA', r'\d+/(\d*)')
    if value:
        d['month'] = value.strip()

    value = _first_value(ris_text, 'SN')
    if value is not None:
        d['isbn'] = value.strip()

    # DOIs may be in N1 (notes) tag, search for it in any tag
    m = re.search(doi_regex, ris_text)
    if m:
        d['doi'] = m.group(0).strip()

    value = _first_value(ris_text, 'SP')
    if value is not None:
        d['startpage'] = value.strip()
        d['pages'] = d['startpage']

    value = _first_value(ris_text, 'EP')
    if value is not None:
        d['endpage'] = value.strip()
        # An end page may appear without a start page; only build the
        # "start–end" range when both are known instead of raising KeyError.
        if 'startpage' in d:
            d['pages'] = d['startpage'] + '–' + d['endpage']

    value = _first_value(ris_text, 'UR')
    if value is not None:
        # in RIS, several urls can be separated using a ";"; keep the first
        d['url'] = value.split(';')[0].strip()

    value = _first_value(ris_text, 'LA')
    if value is not None:
        d['language'] = value.strip()

    return d