Skip to content

Commit

Permalink
Add component google scholar (#1790)
Browse files Browse the repository at this point in the history
### What problem does this PR solve?

#1739 

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
  • Loading branch information
guoyuhao2330 authored Aug 2, 2024
1 parent 418700b commit 5d55e6a
Show file tree
Hide file tree
Showing 5 changed files with 84 additions and 4 deletions.
1 change: 1 addition & 0 deletions graph/component/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from .arxiv import ArXiv, ArXivParam
from .google import Google, GoogleParam
from .bing import Bing, BingParam
from .googlescholar import GoogleScholar, GoogleScholarParam


def component_class(class_name):
Expand Down
70 changes: 70 additions & 0 deletions graph/component/googlescholar.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from abc import ABC
import pandas as pd
from graph.settings import DEBUG
from graph.component.base import ComponentBase, ComponentParamBase
from scholarly import scholarly


class GoogleScholarParam(ComponentParamBase):
    """
    Parameters for the GoogleScholar component.

    Attributes set by ``__init__``:
        top_n: maximum number of publications to fetch (default 6).
        sort_by: result ordering, 'relevance' or 'date' (default 'relevance').
        year_low: lower publication-year bound, or None (default None).
        year_high: upper publication-year bound, or None (default None).
        patents: whether patents are included in the search (default True).
    """

    def __init__(self):
        super().__init__()
        # Result-count limit and ordering.
        self.top_n = 6
        self.sort_by = 'relevance'
        # Optional publication-year window; None leaves that side unset.
        self.year_low = None
        self.year_high = None
        # Patent inclusion flag.
        self.patents = True

    def check(self):
        """Validate top_n, sort_by and patents using the base-class checkers.

        year_low and year_high are not validated here.
        """
        allowed_orderings = ['date', 'relevance']
        patents_descr = "Whether or not to include patents, defaults to True"

        self.check_positive_integer(self.top_n, "Top N")
        self.check_valid_value(
            self.sort_by, "GoogleScholar Sort_by", allowed_orderings
        )
        self.check_boolean(self.patents, patents_descr)


class GoogleScholar(ComponentBase, ABC):
    """Component that queries Google Scholar with the component's input text."""

    component_name = "GoogleScholar"

    def _run(self, history, **kwargs):
        """Search Google Scholar and return the top results as a DataFrame.

        The query is the " - "-joined "content" field of ``self.get_input()``.
        At most ``self._param.top_n`` publications are read from
        ``scholarly.search_pubs``; each becomes one row whose "content" column
        holds the title, URL, authors and abstract as text.

        Args:
            history: not used by this method.
            **kwargs: not used by this method.

        Returns:
            A pandas DataFrame with a single "content" column, or the result
            of ``GoogleScholar.be_output("")`` when the query is empty or no
            publication could be collected.
        """
        ans = self.get_input()
        ans = " - ".join(ans["content"]) if "content" in ans else ""
        if not ans:
            return GoogleScholar.be_output("")

        scholar_client = scholarly.search_pubs(
            ans,
            patents=self._param.patents,
            year_low=self._param.year_low,
            year_high=self._param.year_high,
            sort_by=self._param.sort_by,
        )
        scholar_res = []
        for _ in range(self._param.top_n):
            try:
                pub = next(scholar_client)
                bib = pub['bib']
                scholar_res.append({"content": "".join([
                    'Title: ', bib['title'],
                    # Not every result is guaranteed to carry a URL, so fall
                    # back to an empty href instead of failing on the lookup.
                    '\n_Url: <a href="', pub.get('pub_url', ''), '"></a> ',
                    "\n author: ", ",".join(bib['author']),
                    '\n Abstract: ', bib.get('abstract', 'no abstract'),
                ])})
            except StopIteration:
                # Fewer than top_n results available: normal end, not an error.
                break
            except Exception as e:
                # `except StopIteration or Exception` only ever caught
                # StopIteration; fetch/format failures must be handled
                # separately. Keep what was collected so far and stop.
                print("**ERROR** " + str(e))
                break

        if not scholar_res:
            return GoogleScholar.be_output("")

        df = pd.DataFrame(scholar_res)
        if DEBUG:
            print(df, ":::::::::::::::::::::::::::::::::")
        return df
7 changes: 5 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ datrie==0.8.2
demjson3==3.0.6
discord.py==2.3.2
duckduckgo_search==6.1.9
editdistance==0.8.1
elastic_transport==8.12.0
elasticsearch==8.12.1
elasticsearch_dsl==8.12.0
Expand All @@ -31,7 +32,9 @@ httpx==0.27.0
huggingface_hub==0.20.3
infinity_emb==0.0.51
itsdangerous==2.1.2
jina==3.27.2
Markdown==3.6
markdown_to_json==2.1.1
minio==7.2.4
mistralai==0.4.2
nltk==3.8.1
Expand All @@ -51,6 +54,7 @@ pipreqs==0.5.0
protobuf==5.27.2
pyclipper==1.3.0.post5
pycryptodomex==3.20.0
pypdf==4.3.0
PyPDF2==3.0.1
pytest==8.2.2
python-dotenv==1.0.1
Expand All @@ -61,6 +65,7 @@ redis==5.0.3
Requests==2.32.2
roman_numbers==1.0.2
ruamel.base==1.0.0
scholarly==1.7.11
scikit_learn==1.5.0
selenium==4.22.0
setuptools==70.0.0
Expand All @@ -80,5 +85,3 @@ word2number==1.1
xgboost==2.1.0
xpinyin==0.7.6
zhipuai==2.0.1
pypdf==4.3.0
jina==3.27.2
5 changes: 4 additions & 1 deletion requirements_arm.txt
Original file line number Diff line number Diff line change
Expand Up @@ -155,4 +155,7 @@ Bio==1.7.1
arxiv==2.1.3
pypdf==4.3.0
google_search_results==2.4.2
jina==3.27.2
jina==3.27.2
editdistance==0.8.1
markdown_to_json==2.1.1
scholarly==1.7.11
5 changes: 4 additions & 1 deletion requirements_dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -140,4 +140,7 @@ Bio==1.7.1
arxiv==2.1.3
pypdf==4.3.0
google_search_results==2.4.2
jina==3.27.2
jina==3.27.2
editdistance==0.8.1
markdown_to_json==2.1.1
scholarly==1.7.11

0 comments on commit 5d55e6a

Please sign in to comment.