Commit

Merge pull request #12 from vomaksh/dev
chore: pending updates
vomaksh authored Aug 3, 2024
2 parents 0c83e97 + 451048c commit 2aaee1a
Showing 17 changed files with 622 additions and 429 deletions.
63 changes: 63 additions & 0 deletions .github/workflows/master.yml
@@ -0,0 +1,63 @@
name: Create and publish images

on:
  push:
    branches:
      - master

jobs:
  publish:
    runs-on: ubuntu-latest

    permissions:
      packages: write
      contents: read
      attestations: write
      id-token: write

    steps:
      - uses: actions/checkout@v4

      - name: Log in to the Container registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Create .env file
        run: |
          echo "POSTGRES_URI=${{ secrets.APP_POSTGRES_URI }}" > .env

      - name: Build and push listener image
        uses: docker/build-push-action@v5
        with:
          context: .
          file: src/listener/Dockerfile
          push: true
          tags: ghcr.io/${{ github.repository }}/listener:latest

      - name: Build and push top_stories image
        uses: docker/build-push-action@v5
        with:
          context: .
          file: src/top_stories/Dockerfile
          push: true
          tags: ghcr.io/${{ github.repository }}/top_stories:latest

      - name: Setup ssh key
        run: |
          mkdir -p ~/.ssh/
          echo "${{ secrets.ACTION_PRIVATE_KEY }}" > ~/.ssh/github_action.key
          sudo chmod 600 ~/.ssh/github_action.key
          ssh-keyscan -H ${{ secrets.SERVER_IP }} > ~/.ssh/known_hosts

      - name: Deploy
        run: |
          ssh ${{ secrets.SERVER_USERNAME }}@${{ secrets.SERVER_IP }} -i ~/.ssh/github_action.key /bin/bash << EOF
          cd ${{ secrets.REPO_PATH }};
          git pull origin master;
          docker compose down;
          docker compose pull;
          docker compose up -d;
          EOF
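Taken together, a push to master rebuilds both service images, publishes them to ghcr.io, and the Deploy step then SSHes into the server to git pull and run docker compose pull / docker compose up -d, so the production compose file below (now pointing at the ghcr.io images) picks up the freshly published tags.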
3 changes: 2 additions & 1 deletion .gitignore
@@ -1,4 +1,5 @@
 .env
 __pycache__
 .vscode
-.idea
+.idea
+.venv
2 changes: 1 addition & 1 deletion README.md
@@ -1,4 +1,4 @@
-# HNScraper
+# hckernews
 
 Scrapes relevant stories and their comments for learning.
 It scrapes historical stories and comments and also polls for new stories and extracts their comments.
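As a rough illustration of the polling the README describes (not code from this repository), a minimal aiohttp sketch against the public Hacker News Firebase API could look like this; the endpoint URLs are the documented public API, everything else is assumed:

# sketch only — not part of this diff; shows the kind of polling the README describes
import asyncio
import aiohttp

HN_API = "https://hacker-news.firebaseio.com/v0"


async def fetch_json(session: aiohttp.ClientSession, path: str):
    async with session.get(f"{HN_API}/{path}") as resp:
        return await resp.json()


async def main():
    async with aiohttp.ClientSession() as session:
        # newest item id, then walk backwards for historical stories and comments
        max_item = await fetch_json(session, "maxitem.json")
        item = await fetch_json(session, f"item/{max_item}.json")
        print(max_item, item.get("type") if item else None)

        # current front-page story ids — presumably what the top_stories service records
        top = await fetch_json(session, "topstories.json")
        print(top[:5])


asyncio.run(main())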
34 changes: 34 additions & 0 deletions docker-compose.dev.yml
@@ -0,0 +1,34 @@
x-healthcheck-config: &healthcheck-config
  interval: 10s
  timeout: 10s
  retries: 5

services:
  listener:
    build:
      context: .
      dockerfile: src/listener/Dockerfile
    depends_on:
      - postgres

  top_stories:
    build:
      context: .
      dockerfile: src/top_stories/Dockerfile
    depends_on:
      - postgres
      - listener

  postgres:
    image: postgres:16.3-alpine
    ports:
      - "5432:5432"
    env_file:
      - .env
    healthcheck:
      <<: *healthcheck-config
    volumes:
      - pg_data:/var/lib/postgresql/data

volumes:
  pg_data:
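For local development this stack can presumably be brought up with docker compose -f docker-compose.dev.yml up --build; unlike the production file below, it builds both images from the local Dockerfiles and exposes Postgres on 5432, and the services reach the database via the postgres service name on the compose network.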
31 changes: 6 additions & 25 deletions docker-compose.yml
@@ -1,27 +1,8 @@
-x-healthcheck-config: &healthcheck-config
-  interval: 10s
-  timeout: 10s
-  retries: 5
-
 services:
-  app:
-    build:
-      context: .
-      dockerfile: Dockerfile
+  listener:
+    image: ghcr.io/ginruh/hckernews/listener:latest
+
+  top_stories:
+    image: ghcr.io/ginruh/hckernews/top_stories:latest
     depends_on:
-      - mongodb
-
-  mongodb:
-    image: mongo:7.0.9
-    ports:
-      - "27017:27017"
-    env_file:
-      - .env
-    healthcheck:
-      <<: *healthcheck-config
-      test: echo 'db.runCommand("ping").ok' | mongosh localhost:27017/test --quiet
-    volumes:
-      - mongodb_data:/data/db
-
-volumes:
-  mongodb_data:
+      - listener
465 changes: 204 additions & 261 deletions poetry.lock

Large diffs are not rendered by default.

6 changes: 4 additions & 2 deletions pyproject.toml
@@ -12,12 +12,14 @@ packages = [
 python = "^3.12"
 
 
-odmantic = "^1.0.2"
 aiohttp = "^3.9.5"
 python-dotenv = "^1.0.1"
+sqlalchemy = {extras = ["asyncio"], version = "^2.0.31"}
+asyncpg = "^0.29.0"
 [build-system]
 requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"
 
 [tool.poetry.scripts]
-app = "src:main"
+listener = "src:listener.main"
+top_stories = "src:top_stories.main"
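The single app entry point gives way to one script per service (listener, top_stories), mirroring the two images the workflow builds, and the MongoDB ODM (odmantic) is replaced by SQLAlchemy's asyncio extra plus asyncpg.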
27 changes: 0 additions & 27 deletions src/__init__.py
@@ -1,27 +0,0 @@
import asyncio
from typing import cast
from dotenv import load_dotenv
from os import getenv
from src.db import connect_db
from src.hackernews import HackerNews


async def main():
    load_dotenv()
    engine = connect_db(
        mongodb_uri=cast(str, getenv("MONGODB_URI")),
        database=cast(str, getenv("MONGODB_DATABASE")),
    )
    HackerNews.set_engine(engine=engine)
    latest_item_id = await HackerNews.get_latest_item()
    print(f"Latest item ID: {latest_item_id}")
    if latest_item_id is None:
        raise Exception("Unable to fetch latest item_id. Exiting")
    tasks = [
        asyncio.create_task(HackerNews.fetch_story_items(end_item=latest_item_id)),
        asyncio.create_task(HackerNews.listen_updates()),
    ]
    await asyncio.gather(*tasks)


asyncio.run(main())
Empty file added src/common/__init__.py
136 changes: 136 additions & 0 deletions src/common/db.py
@@ -0,0 +1,136 @@
from datetime import datetime
from typing import Optional
from sqlalchemy import ForeignKey, select, func, ARRAY, Integer
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column
from sqlalchemy.ext.asyncio import create_async_engine, async_sessionmaker, AsyncSession
from sqlalchemy.dialects.postgresql import Insert


class Base(DeclarativeBase):
    pass


class Item(Base):
    __tablename__ = "item"

    id: Mapped[int] = mapped_column(primary_key=True)
    deleted: Mapped[bool] = mapped_column(default=False)
    type: Mapped[str]
    by: Mapped[Optional[str]] = mapped_column(nullable=True)
    time: Mapped[int]
    dead: Mapped[bool] = mapped_column(default=False)
    parent: Mapped[Optional[int]] = mapped_column(ForeignKey("item.id"), nullable=True)
    kids: Mapped[list[int]] = mapped_column(ARRAY(Integer), default=[])
    url: Mapped[Optional[str]] = mapped_column(nullable=True)
    score: Mapped[Optional[int]] = mapped_column(nullable=True)
    title: Mapped[Optional[str]] = mapped_column(nullable=True)  # null for comment
    text: Mapped[Optional[str]] = mapped_column(nullable=True)  # maybe null for story
    descendants: Mapped[Optional[int]] = mapped_column(default=0)


class TopStories(Base):
    __tablename__ = "top_stories"

    story_item_id: Mapped[int] = mapped_column(primary_key=True)
    created_at: Mapped[datetime] = mapped_column(
        primary_key=True, server_default=func.now()
    )


async def connect_db(postgres_uri: str) -> async_sessionmaker[AsyncSession]:
    engine = create_async_engine(postgres_uri)

    async with engine.begin() as conn:
        await conn.run_sync(Base.metadata.create_all)

    return async_sessionmaker(engine, expire_on_commit=False)


async def save_items(
    *, async_session: async_sessionmaker[AsyncSession], items: list[dict | None]
) -> list[Item]:
    if len(items) == 0:
        return []
    filtered_items: list[dict] = []
    for item in items:
        # removing failed request items
        if item is None:
            continue
        # remove item with no type
        if item.get("type") is None:
            continue
        # not interested in jobs and polls
        if item.get("type") != "story" and item.get("type") != "comment":
            continue
        if item.get("type") == "story":
            # removing spams
            if item.get("score", 0) >= 3:
                filtered_items.append(item)
        if item.get("type") == "comment":
            filtered_items.append(item)
    hn_items = [
        dict(
            id=f_item["id"],
            type=f_item["type"],
            time=f_item["time"],
            by=f_item.get("by", None),
            dead=f_item.get("dead", False),
            deleted=f_item.get("deleted", False),
            descendants=f_item.get("descendants", 0),
            kids=f_item.get("kids", []),
            parent=f_item.get("parent", None),
            score=f_item.get("score", None),
            text=f_item.get("text", None),
            title=f_item.get("title", None),
            url=f_item.get("url", None),
        )
        for f_item in filtered_items
    ]
    async with async_session() as session:
        async with session.begin():
            stmt = Insert(Item).values(hn_items)
            result = await session.scalars(
                stmt.on_conflict_do_update(
                    index_elements=["id"],
                    set_={
                        "type": stmt.excluded.type,
                        "time": stmt.excluded.time,
                        "by": stmt.excluded.by,
                        "dead": stmt.excluded.dead,
                        "deleted": stmt.excluded.deleted,
                        "descendants": stmt.excluded.descendants,
                        "kids": stmt.excluded.kids,
                        "parent": stmt.excluded.parent,
                        "score": stmt.excluded.score,
                        "text": stmt.excluded.text,
                        "title": stmt.excluded.title,
                        "url": stmt.excluded.url,
                    },
                ).returning(Item)
            )
            return list(result.all())


async def get_item(*, async_session: async_sessionmaker[AsyncSession], item_id: int):
    async with async_session() as session:
        stmt = select(Item).where(Item.id == item_id)
        items = await session.scalars(stmt)
        return items.first()


async def save_top_stories(
    *, async_session: async_sessionmaker[AsyncSession], stories: list[int]
):
    top_stories = [TopStories(story_item_id=story_item_id) for story_item_id in stories]
    async with async_session() as session:
        async with session.begin():
            session.add_all(top_stories)


async def bulk_find_items(
    *, async_session: async_sessionmaker[AsyncSession], items: list[int]
):
    async with async_session() as session:
        async with session.begin():
            result = await session.scalars(select(Item).where(Item.id.in_(items)))
            return list(result.all())
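For orientation, a minimal sketch (not part of this diff) of how these helpers might be wired together by one of the services; POSTGRES_URI matches the variable the workflow writes into .env, and the sample item shape is assumed from the filtering in save_items above:

# sketch only — the real entry points live in src/listener and src/top_stories
import asyncio
from os import getenv

from dotenv import load_dotenv

from src.common.db import connect_db, save_items, get_item


async def main():
    load_dotenv()
    # POSTGRES_URI is the variable the CI workflow writes into .env
    async_session = await connect_db(getenv("POSTGRES_URI", ""))

    # a raw item as returned by the Hacker News API (shape assumed for illustration)
    raw_items = [
        {"id": 1, "type": "story", "time": 1160418111, "by": "pg", "score": 57,
         "title": "Y Combinator", "url": "http://ycombinator.com"},
    ]
    saved = await save_items(async_session=async_session, items=raw_items)
    print(f"saved {len(saved)} items")

    item = await get_item(async_session=async_session, item_id=1)
    print(item.title if item else "not found")


asyncio.run(main())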
