-
Notifications
You must be signed in to change notification settings - Fork 6
/
get_DFG_documents.sh
executable file
·276 lines (253 loc) · 11.8 KB
/
get_DFG_documents.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
#!/bin/bash
# This script downloads and reasonably renames (pdf and rtf) documents
# from the DFG web site associated with a particular funding instrument.
#
# Get help and usage examples via
# get_DFG_documents.sh
# Set debugging flag (0 = debugging off, any other integer = debugging on)
_DEBUG=0
# Declare intelligent debug function
# from http://www.cyberciti.biz/tips/debugging-shell-script.html
#
# Usage:   DEBUG command [arguments...]
# Runs the given command only if $_DEBUG is non-zero.
# The arguments are handed over as "$@", i.e., exactly as the caller wrote
# them; an unquoted $@ would split them at whitespace again and expand
# glob characters such as * against the current directory.
# Returns: the status of the command, or 0 if debugging is switched off.
function DEBUG()
{
  if [ "$_DEBUG" -ne 0 ]; then
    "$@"
  fi
}
# Usage:   NODEBUG command [arguments...]
# Runs the given command only if $_DEBUG is 0, i.e., when debugging is
# switched off.
# The arguments are handed over as "$@" so that they reach the command
# exactly as written by the caller (no second word splitting, no globbing).
# Returns: the status of the command, or 0 if debugging is switched on.
function NODEBUG()
{
  if [ "$_DEBUG" -eq 0 ]; then
    "$@"
  fi
}
# Declare a function which lists some information about all documents
#
# Reads the global arrays formNr, formFileType, formLanguage, formDate and
# formTitle and prints one line per entry with the columns
#   number, file type, language, date, title
# The loop index is local so that calling this function does not overwrite
# a global variable named i.
function list()
{
  local idx
  for idx in "${!formNr[@]}"; do
    printf -- '%-14s %-4s %-2s %-7s %-30s\n' \
      "${formNr[$idx]}" \
      "${formFileType[$idx]}" \
      "${formLanguage[$idx]}" \
      "${formDate[$idx]}" \
      "${formTitle[$idx]}"
  done
}
# Issue help message if necessary, i.e., if no argument is given at all, or
# if the argument following the URL is neither "list" nor "get"
if [[ $# -eq 0 ]] || [[ $# -gt 1 && $2 != "list" && $2 != "get" ]]; then
  # Determine the script name and the example URL only once
  PROGNAME=$(basename "$0")
  EXAMPLEURL="https://www.dfg.de/foerderung/programme/einzelfoerderung/sachbeihilfe/formulare_merkblaetter/index.jsp"
  echo
  echo "Usage: $PROGNAME URL (lists all documents, no downloads)"
  echo "Usage: $PROGNAME URL list (lists all documents, no downloads)"
  echo "Usage: $PROGNAME URL get [doc number 1] [doc number 2] (retrieves documents whose numbers are given, in all languages)"
  echo "Usage: $PROGNAME URL get de [doc number 1] en [doc number 2] (retrieves some documents in German, some in English)"
  echo "Usage: $PROGNAME URL get . (retrieves all forms)"
  echo
  echo "Example: $PROGNAME $EXAMPLEURL"
  echo "Example: $PROGNAME $EXAMPLEURL list"
  echo "Example: $PROGNAME $EXAMPLEURL get 52.01 '60.12 -2018-'"
  echo "Example: $PROGNAME $EXAMPLEURL get de 52.01 '60.12 -2018-'"
  echo "Example: $PROGNAME $EXAMPLEURL get en 52.01 de '60.12 -2018-'"
  # The regular expression must be closed by a single quote, otherwise the
  # example cannot be pasted into a shell
  echo "Example: $PROGNAME $EXAMPLEURL get de '60.12\$'"
  echo "Example: $PROGNAME $EXAMPLEURL get de '^1\.'"
  echo
  echo "Document numbers are interpreted as bash regular expressions."
  echo "Downloaded documents will be given reasonable names."
  echo
  exit 1
fi
# Download the URL into a temporary file
URL=$1
DOWNLOAD=$(mktemp) || { echo "ERROR creating a temporary file" >&2; exit 1; }
DEBUG echo Download file "$DOWNLOAD"
# The URL and the file name are quoted so that characters like ? or * in the
# URL are not subject to pathname expansion.
# If the download fails, $DOWNLOAD stays empty; no entries are found in it
# then, which is reported once the file has been parsed.
wget --quiet "$URL" -O "$DOWNLOAD"
# Typical relevant records look like this (spaces not to scale) [:shudder:]
#
# Example for a pdf document
# <td class="first"><span>50.01</span></td>
# <td><abbr title="Deutsch"><span class="contentType">de<span></span></span></abbr></td>
# <td class="titel"><strong>Merkblatt Programm Sachbeihilfe [04/21]</a></strong></td>
# <td><a href="https://www.dfg.de/formulare/50_01/50_01_de.pdf" target="_blank">PDF</a></td>
#
# Example for a pdf and rtf document
# <td class="first"><span>54.011</span></td>
# <td><abbr title="Deutsch"><span class="contentType">de<span></span></span></abbr></td>
# <td class="titel">Daten zum Antrag und Verpflichtungen - Projektanträge (nur für Programme, in denen eine Antragstellung über das elan-Portal noch nicht möglich ist) [04/21]</a></td>
# <td><a href="https://www.dfg.de/formulare/54_011/54_011_de.pdf" target="_blank">PDF</a>, <a href="https://www.dfg.de/formulare/54_011/54_011_de_rtf.rtf" target="_blank">RTF</a></td>
#
# Example for a jsp link, which we do not follow up on
# <td class="first"><span>2.00</span></td>
# <td><abbr title="Deutsch"><span class="contentType">de<span></span></span></abbr></td>
# <td class="titel"><a href="https://www.dfg.de/formulare/2_00/index.jsp" target="_blank">Verwendungsrichtlinien - Allgemeine Bedingungen für Förderverträge mit der Deutschen Forschungsgemeinschaft e.V. (DFG)</a></td>
# <td><a href="https://www.dfg.de/formulare/2_00/index.jsp" target="_blank"></a></td>
#
# Example for documents with title containing a stray <span> tag
# Possibly no longer relevant as of 20220923
# <td class="first"><span>21.40</span></td>
# <td><abbr title="Deutsch"><span class="contentType">de<span></span></span></abbr></td>
# <td class="titel"><span size="5">Antrag für Großgeräte in Forschungsbauten </span>nach Art. 91b GG [03/18]</a></td>
# <td><a href="http://www.dfg.de/formulare/21_40/21_40_de.pdf">PDF</a>, <a href="http://www.dfg.de/formulare/21_40/21_40_de_rtf.rtf">RTF</a></td>
# Define the start pattern to recognize a relevant entry
# The pattern only decides whether a line starts an entry; nothing is
# extracted with it, so a plain .* is sufficient (awk has no non-greedy
# quantifiers anyway).
STARTPATTERN='<td class="first"><span>.*</span></td>'
# Escape every / because the pattern is placed between /.../ in the awk
# program below
STARTPATTERN=${STARTPATTERN//\//\\/}
DEBUG echo "START" "$STARTPATTERN"
DEBUG echo
# Define an awk program which filters out chunks of four lines, each beginning with the start pattern
# getline is only called between the lines of a chunk, not after its last
# line. Otherwise the line following a chunk would be consumed without being
# tested against the start pattern, and an entry starting there would be lost.
PROGFILTER="/$STARTPATTERN/ {for (i=1; i<=4; i++) {print; if (i<4) getline}}"
DEBUG echo "$PROGFILTER"
DEBUG echo
# Define an awk program to replace multiple spaces by just one
PROGREDUCESPACES='{gsub(/ [ ]+/," ",$0);} 1'
# Define an awk program to beautify and decode some HTML stuff
# Only the named character entities are replaced. In particular, literal
# double quotes are left alone since they are part of the HTML attributes
# which are matched later on.
PROGHTMLDECODE='{
  # replace HTML umlaut entities by plain umlauts
  gsub(/&Auml;/,"Ä");
  gsub(/&Ouml;/,"Ö");
  gsub(/&Uuml;/,"Ü");
  gsub(/&auml;/,"ä");
  gsub(/&ouml;/,"ö");
  gsub(/&uuml;/,"ü");
  gsub(/&szlig;/,"ß");
  # drop encoded quotation marks
  gsub(/&quot;/,"");
  # replace encoded dashes and non-breaking spaces by their plain counterparts
  gsub(/&ndash;/,"-");
  gsub(/&nbsp;/," ");
  print
}'
# Define an awk program to remove lines containing links to jsp files
PROGNOJSP='!/\.jsp/'
# Define an intermediate file
INTERMEDIATE=$(mktemp) || { echo "ERROR creating a temporary file" >&2; exit 1; }
# Let awk do its magic to filter the relevant lines, remove duplicate spaces, decode HTML stuff, and paste chunks of four lines into one
# Then remove lines linking to non-directly downloadable content, i.e., other jsp files (Verwendungsrichtlinien)
# The file names are quoted so that the pipeline also works if the
# temporary directory contains spaces.
awk "$PROGFILTER" "$DOWNLOAD" \
  | awk "$PROGREDUCESPACES" \
  | awk "$PROGHTMLDECODE" \
  | paste -d ' ' - - - - \
  | tr -d "\r" \
  | awk "$PROGNOJSP" > "$INTERMEDIATE"
# gvim -p $DOWNLOAD $INTERMEDIATE
# exit 2
# Create the search pattern for regular downloadable documents
# Capture groups used below:
#    1 form number           2 language (two characters)
#    5 title (first part)    7 title (remainder)          8 date, e.g. [04/21]
#   10 URL                  12 file type, e.g. PDF
#   14 URL of an optional second link, 16 its file type
LINEPATTERN='<td class="first"><span>(.*)</span></td>[[:space:]]*<td><abbr title="[a-zA-Z]+"><span class="contentType">(..)<span></span></span></abbr></td>[[:space:]]*<td class="titel">(<span size=[^>]*>)?(<strong>)?([^<]*)(</span>)?(.*) (\[.*\])</a>(</strong>)?</td>[[:space:]]*<td><a href="([^>]*)"[[:space:]]+(target="_blank")?>([A-Z]{3,4})</a>(, <a href="([^>]*)"[[:space:]]+(target="_blank")?>([A-Z]{3,4})</a>)?</td>'
# Patterns for the two known notations of the form date (they do not change
# from line to line, so they are defined once)
date_pattern1='\[([[:digit:]]{2})/([[:digit:]]{2})\]'
date_pattern2='\[([[:digit:]]{4})\]'
# Extract the relevant pieces of information from each line
# read -r keeps backslashes in the input as they are
i=0
while read -r LINE; do
  DEBUG echo "Raw line: " "${LINE}"
  if [[ ${LINE} =~ ${LINEPATTERN} ]]; then
    # Save all captures right away; the date matching below overwrites
    # BASH_REMATCH
    formNr[$i]=${BASH_REMATCH[1]}
    formLanguage[$i]=${BASH_REMATCH[2]}
    formTitle[$i]=${BASH_REMATCH[5]}${BASH_REMATCH[7]}
    formDate[$i]=${BASH_REMATCH[8]}
    formURL[$i]=${BASH_REMATCH[10]}
    formFileType[$i]=${BASH_REMATCH[12]}
    formURLSecond=${BASH_REMATCH[14]}
    formFileTypeSecond=${BASH_REMATCH[16]}
    # Canonicalize the form date
    # [07/10] -> 20100700
    # [2018] -> 20180000
    if [[ "${formDate[$i]}" =~ $date_pattern1 ]]; then
      month=${BASH_REMATCH[1]}
      year=20${BASH_REMATCH[2]}
    elif [[ "${formDate[$i]}" =~ $date_pattern2 ]]; then
      year=${BASH_REMATCH[1]}
      month=00
    else
      echo ERROR parsing date "${formDate[$i]}"
      exit 1
    fi
    formDateCanonicalized[$i]=$year$month"00"
    # Canonicalize the form number (replace hyphens, spaces)
    formNrCanonicalized[$i]=${formNr[$i]}
    formNrCanonicalized[$i]=${formNrCanonicalized[$i]// /_} # replace spaces by underscores
    formNrCanonicalized[$i]=${formNrCanonicalized[$i]//-/} # remove hyphens (as in '60.12 -2012-')
    # Canonicalize the form title (replace hyphens, spaces)
    formTitleCanonicalized[$i]=${formTitle[$i]}
    formTitleCanonicalized[$i]=${formTitleCanonicalized[$i]// /_} # replace spaces by underscores
    formTitleCanonicalized[$i]=${formTitleCanonicalized[$i]//_-/} # remove '_-' (as in 'Leitfaden für die Antragstellung - Projektanträge')
    formTitleCanonicalized[$i]=${formTitleCanonicalized[$i]//_\/_/_} # replace '_/_' by '_' ( as in 'Antrag auf Reparatur / Ersatz / Ergänzung einer DFG-Leihgabe')
    formTitleCanonicalized[$i]=${formTitleCanonicalized[$i]//\//_} # replace '/' by '_' ( as in 'SFB/Transregio')
    # Canonicalize the file type (to lower case)
    formFileTypeCanonicalized[$i]=$(printf '%s\n' "${formFileType[$i]}" | tr '[:upper:]' '[:lower:]')
    DEBUG echo "Form number: " "${formNr[$i]}"
    DEBUG echo "Form number canonicalized: " "${formNrCanonicalized[$i]}"
    DEBUG echo "Form language: " "${formLanguage[$i]}"
    DEBUG echo "Form title: " "${formTitle[$i]}"
    DEBUG echo "Form title canonicalized: " "${formTitleCanonicalized[$i]}"
    DEBUG echo "Form date: " "${formDate[$i]}"
    DEBUG echo "Form date canonicalized: " "${formDateCanonicalized[$i]}"
    DEBUG echo "Form URL: " "${formURL[$i]}"
    DEBUG echo "Form file type: " "${formFileType[$i]}"
    DEBUG echo "Form 2nd URL: " "${formURLSecond}"
    DEBUG echo "Form 2nd file type: " "${formFileTypeSecond}"
    DEBUG echo
  else
    # The temporary files are kept on purpose so that the offending line can
    # be inspected
    echo "ERROR parsing line" "${LINE}"
    echo "in $INTERMEDIATE"
    exit 1
  fi
  (( i++ ))
  # If we have a second entry on the same line (like pdf and rtf), create an individual entry for it
  # It is a copy of the entry just created, except for URL and file type
  if [[ -n "${formURLSecond}" ]]; then
    formNr[$i]=${formNr[$i-1]}
    formNrCanonicalized[$i]=${formNrCanonicalized[$i-1]}
    formLanguage[$i]=${formLanguage[$i-1]}
    formTitle[$i]=${formTitle[$i-1]}
    formTitleCanonicalized[$i]=${formTitleCanonicalized[$i-1]}
    formDate[$i]=${formDate[$i-1]}
    formDateCanonicalized[$i]=${formDateCanonicalized[$i-1]}
    formURL[$i]=${formURLSecond}
    formFileType[$i]=${formFileTypeSecond}
    formFileTypeCanonicalized[$i]=$(printf '%s\n' "${formFileType[$i]}" | tr '[:upper:]' '[:lower:]')
    (( i++ ))
  fi
done < "$INTERMEDIATE"
# If no download links were found at all, exit
# (i counts the entries found while parsing)
if [[ $i -eq 0 ]]; then
  echo No download links found.
  # Quoted so that a URL containing ? or * is printed verbatim
  echo "It appears that $URL does not exist, or it is not of the expected type."
  # Remove the temporary files unless we are debugging
  NODEBUG rm "$DOWNLOAD"
  NODEBUG rm "$INTERMEDIATE"
  exit 1
fi
# If no arguments except for the URL are given, or if the command 'list' is
# given, list all the forms found and exit
if [[ $# -eq 1 || $2 == "list" ]]; then
  list
  # Remove the temporary files unless we are debugging
  NODEBUG rm "$DOWNLOAD"
  NODEBUG rm "$INTERMEDIATE"
  exit 0
fi
# If the command 'get' is given, continue to parse all further arguments
# Each argument is either a language switch (en, de), which applies to all
# document numbers following it, or a regular expression for document numbers.
if [[ $2 == "get" ]]; then
  # Default to documents in both languages
  en=1
  de=1
  # Eat two arguments before list of documents to be retrieved
  shift 2
  while [[ $# -gt 0 ]]; do
    key=$1
    DEBUG echo Parsing "$key"
    case "$key" in
      # Switch to English documents from now on only
      en)
        en=1
        de=0
        ;;
      # Switch to German documents from now on only
      de)
        en=0
        de=1
        ;;
      # Expect a document number to be downloaded
      *)
        # Loop over all forms
        for i in "${!formNr[@]}"; do
          # Look up whether the language of this form is currently wanted.
          # The language code stems from the downloaded web page, so it is
          # compared against the known codes instead of being passed to eval.
          case "${formLanguage[$i]}" in
            en) wanted=$en ;;
            de) wanted=$de ;;
            *) wanted=0 ;;
          esac
          # Match the key (a regular expression) against the form number
          # If the language is among the desired languages, download it
          if [[ ${formNr[$i]} =~ $key ]] && [[ $wanted == 1 ]]; then
            filename=${formDateCanonicalized[$i]}_${formNrCanonicalized[$i]}_${formLanguage[$i]}_${formTitleCanonicalized[$i]}.${formFileTypeCanonicalized[$i]}
            echo "Downloading ${formNr[$i]} in language ${formLanguage[$i]} from ${formURL[$i]} as $filename"
            if ! wget --quiet "${formURL[$i]}" --output-document "$filename"; then
              # Do not leave the temporary files behind (unless debugging)
              NODEBUG rm "$DOWNLOAD"
              NODEBUG rm "$INTERMEDIATE"
              exit 1
            fi
          fi
        done
        ;;
    esac
    shift
  done
fi
# Exit (keeping temporary files), or clean up and exit
# With debugging switched on, the next line ends the script right here
DEBUG exit 0
# Clean up and exit
rm -- "$DOWNLOAD" "$INTERMEDIATE"
exit 0