chore: pending updates #12

Merged · 22 commits · Aug 3, 2024
63 changes: 63 additions & 0 deletions .github/workflows/master.yml
@@ -0,0 +1,63 @@
name: Create and publish images

on:
push:
branches:
- master

jobs:
publish:
runs-on: ubuntu-latest

permissions:
packages: write
contents: read
attestations: write
id-token: write

steps:
- uses: actions/checkout@v4

- name: Log in to the Container registry
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Create .env file
run: |
echo "POSTGRES_URI=${{ secrets.APP_POSTGRES_URI }}" > .env

- name: Build and push listener image
uses: docker/build-push-action@v5
with:
context: .
file: src/listener/Dockerfile
push: true
tags: ghcr.io/${{ github.repository }}/listener:latest

- name: Build and push top_stories image
uses: docker/build-push-action@v5
with:
context: .
file: src/top_stories/Dockerfile
push: true
tags: ghcr.io/${{ github.repository }}/top_stories:latest

- name: Setup ssh key
run: |
mkdir -p ~/.ssh/
echo "${{ secrets.ACTION_PRIVATE_KEY }}" > ~/.ssh/github_action.key
sudo chmod 600 ~/.ssh/github_action.key
ssh-keyscan -H ${{ secrets.SERVER_IP }} > ~/.ssh/known_hosts

- name: Deploy
run: |
ssh ${{ secrets.SERVER_USERNAME }}@${{ secrets.SERVER_IP }} -i ~/.ssh/github_action.key /bin/bash << EOF
cd ${{ secrets.REPO_PATH }};
git pull origin master;
docker compose down;
docker compose pull;
docker compose up -d;
EOF
3 changes: 2 additions & 1 deletion .gitignore
@@ -1,4 +1,5 @@
.env
__pycache__
.vscode
-.idea
+.idea
+.venv
2 changes: 1 addition & 1 deletion README.md
@@ -1,4 +1,4 @@
-# HNScraper
+# hckernews

Scrapes relevant stories and their comments for learning.
It scrapes historical stories and comments and also polls for new stories and extracts their comments.
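
The README summary above is the only prose description of what the listener does: it polls for new stories and pulls in their comments. For orientation, here is a minimal polling sketch against the public Hacker News Firebase API (maxitem.json and item/<id>.json) using aiohttp, which is already a project dependency. It is illustrative only and is not the repository's actual listener code.

```python
import asyncio

import aiohttp

HN_API = "https://hacker-news.firebaseio.com/v0"


async def fetch_json(session: aiohttp.ClientSession, path: str):
    # The HN Firebase API serves plain JSON; maxitem.json is an int,
    # item/<id>.json is a dict (or null for missing items).
    async with session.get(f"{HN_API}/{path}") as resp:
        return await resp.json()


async def poll_new_items(poll_interval: float = 30.0) -> None:
    async with aiohttp.ClientSession() as session:
        last_seen: int = await fetch_json(session, "maxitem.json")
        while True:
            await asyncio.sleep(poll_interval)
            newest: int = await fetch_json(session, "maxitem.json")
            for item_id in range(last_seen + 1, newest + 1):
                item = await fetch_json(session, f"item/{item_id}.json")
                # a real service would hand the payload to something like
                # save_items() from src/common/db.py below
                print(item)
            last_seen = newest


if __name__ == "__main__":
    asyncio.run(poll_new_items())
```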
34 changes: 34 additions & 0 deletions docker-compose.dev.yml
@@ -0,0 +1,34 @@
x-healthcheck-config: &healthcheck-config
interval: 10s
timeout: 10s
retries: 5

services:
listener:
build:
context: .
dockerfile: src/listener/Dockerfile
depends_on:
- postgres

top_stories:
build:
context: .
dockerfile: src/top_stories/Dockerfile
depends_on:
- postgres
- listener

postgres:
image: postgres:16.3-alpine
ports:
- "5432:5432"
env_file:
- .env
healthcheck:
<<: *healthcheck-config
volumes:
- pg_data:/var/lib/postgresql/data

volumes:
pg_data:
31 changes: 6 additions & 25 deletions docker-compose.yml
@@ -1,27 +1,8 @@
-x-healthcheck-config: &healthcheck-config
-  interval: 10s
-  timeout: 10s
-  retries: 5
-
services:
-  app:
-    build:
-      context: .
-      dockerfile: Dockerfile
+  listener:
+    image: ghcr.io/ginruh/hckernews/listener:latest
+
+  top_stories:
+    image: ghcr.io/ginruh/hckernews/top_stories:latest
depends_on:
-      - mongodb
-
-  mongodb:
-    image: mongo:7.0.9
-    ports:
-      - "27017:27017"
-    env_file:
-      - .env
-    healthcheck:
-      <<: *healthcheck-config
-      test: echo 'db.runCommand("ping").ok' | mongosh localhost:27017/test --quiet
-    volumes:
-      - mongodb_data:/data/db
-
-volumes:
-  mongodb_data:
+      - listener
465 changes: 204 additions & 261 deletions poetry.lock

Large diffs are not rendered by default.

6 changes: 4 additions & 2 deletions pyproject.toml
@@ -12,12 +12,14 @@ packages = [
python = "^3.12"


-odmantic = "^1.0.2"
aiohttp = "^3.9.5"
python-dotenv = "^1.0.1"
+sqlalchemy = {extras = ["asyncio"], version = "^2.0.31"}
+asyncpg = "^0.29.0"
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

[tool.poetry.scripts]
-app = "src:main"
+listener = "src:listener.main"
+top_stories = "src:top_stories.main"
27 changes: 0 additions & 27 deletions src/__init__.py
@@ -1,27 +0,0 @@
import asyncio
from typing import cast
from dotenv import load_dotenv
from os import getenv
from src.db import connect_db
from src.hackernews import HackerNews


async def main():
load_dotenv()
engine = connect_db(
mongodb_uri=cast(str, getenv("MONGODB_URI")),
database=cast(str, getenv("MONGODB_DATABASE")),
)
HackerNews.set_engine(engine=engine)
latest_item_id = await HackerNews.get_latest_item()
print(f"Latest item ID: {latest_item_id}")
if latest_item_id is None:
raise Exception("Unable to fetch latest item_id. Exiting")
tasks = [
asyncio.create_task(HackerNews.fetch_story_items(end_item=latest_item_id)),
asyncio.create_task(HackerNews.listen_updates()),
]
await asyncio.gather(*tasks)


asyncio.run(main())
Empty file added src/common/__init__.py
136 changes: 136 additions & 0 deletions src/common/db.py
@@ -0,0 +1,136 @@
from datetime import datetime
from typing import Optional
from sqlalchemy import ForeignKey, select, func, ARRAY, Integer
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column
from sqlalchemy.ext.asyncio import create_async_engine, async_sessionmaker, AsyncSession
from sqlalchemy.dialects.postgresql import Insert


class Base(DeclarativeBase):
pass


class Item(Base):
__tablename__ = "item"

id: Mapped[int] = mapped_column(primary_key=True)
deleted: Mapped[bool] = mapped_column(default=False)
type: Mapped[str]
by: Mapped[Optional[str]] = mapped_column(nullable=True)
time: Mapped[int]
dead: Mapped[bool] = mapped_column(default=False)
parent: Mapped[Optional[int]] = mapped_column(ForeignKey("item.id"), nullable=True)
kids: Mapped[list[int]] = mapped_column(ARRAY(Integer), default=[])
url: Mapped[Optional[str]] = mapped_column(nullable=True)
score: Mapped[Optional[int]] = mapped_column(nullable=True)
title: Mapped[Optional[str]] = mapped_column(nullable=True) # null for comment
text: Mapped[Optional[str]] = mapped_column(nullable=True) # maybe null for story
descendants: Mapped[Optional[int]] = mapped_column(default=0)


class TopStories(Base):
__tablename__ = "top_stories"

story_item_id: Mapped[int] = mapped_column(primary_key=True)
created_at: Mapped[datetime] = mapped_column(
primary_key=True, server_default=func.now()
)


async def connect_db(postgres_uri: str) -> async_sessionmaker[AsyncSession]:
engine = create_async_engine(postgres_uri)

async with engine.begin() as conn:
await conn.run_sync(Base.metadata.create_all)

return async_sessionmaker(engine, expire_on_commit=False)


async def save_items(
*, async_session: async_sessionmaker[AsyncSession], items: list[dict | None]
) -> list[Item]:
if len(items) == 0:
return []
filtered_items: list[dict] = []
for item in items:
# removing failed request items
if item is None:
continue
# remove item with no type
if item.get("type") is None:
continue
# not interested in jobs and polls
if item.get("type") != "story" and item.get("type") != "comment":
continue
if item.get("type") == "story":
# removing spams
if item.get("score", 0) >= 3:
filtered_items.append(item)
if item.get("type") == "comment":
filtered_items.append(item)
hn_items = [
dict(
id=f_item["id"],
type=f_item["type"],
time=f_item["time"],
by=f_item.get("by", None),
dead=f_item.get("dead", False),
deleted=f_item.get("deleted", False),
descendants=f_item.get("descendants", 0),
kids=f_item.get("kids", []),
parent=f_item.get("parent", None),
score=f_item.get("score", None),
text=f_item.get("text", None),
title=f_item.get("title", None),
url=f_item.get("url", None),
)
for f_item in filtered_items
]
async with async_session() as session:
async with session.begin():
stmt = Insert(Item).values(hn_items)
result = await session.scalars(
stmt.on_conflict_do_update(
index_elements=["id"],
set_={
"type": stmt.excluded.type,
"time": stmt.excluded.time,
"by": stmt.excluded.by,
"dead": stmt.excluded.dead,
"deleted": stmt.excluded.deleted,
"descendants": stmt.excluded.descendants,
"kids": stmt.excluded.kids,
"parent": stmt.excluded.parent,
"score": stmt.excluded.score,
"text": stmt.excluded.text,
"title": stmt.excluded.title,
"url": stmt.excluded.url,
},
).returning(Item)
)
return list(result.all())


async def get_item(*, async_session: async_sessionmaker[AsyncSession], item_id: int):
async with async_session() as session:
stmt = select(Item).where(Item.id == item_id)
items = await session.scalars(stmt)
return items.first()


async def save_top_stories(
*, async_session: async_sessionmaker[AsyncSession], stories: list[int]
):
top_stories = [TopStories(story_item_id=story_item_id) for story_item_id in stories]
async with async_session() as session:
async with session.begin():
session.add_all(top_stories)


async def bulk_find_items(
*, async_session: async_sessionmaker[AsyncSession], items: list[int]
):
async with async_session() as session:
async with session.begin():
result = await session.scalars(select(Item).where(Item.id.in_(items)))
return list(result.all())
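
Since the old src/__init__.py entrypoint is deleted above and no replacement entrypoint appears in this diff, a short usage sketch of src/common/db.py may help reviewers see how the new helpers fit together. None of this code is part of the PR: the .env and getenv handling mirrors the deleted entrypoint, POSTGRES_URI is the variable the workflow's "Create .env file" step writes, and the sample payload is illustrative.

```python
import asyncio
from os import getenv
from typing import cast

from dotenv import load_dotenv

from src.common.db import connect_db, get_item, save_items


async def main():
    # POSTGRES_URI is read from .env, matching the value the CI workflow
    # writes before building the images.
    load_dotenv()
    async_session = await connect_db(cast(str, getenv("POSTGRES_URI")))

    # save_items() upserts raw HN API payloads; failed fetches (None),
    # job/poll items, and low-score stories are filtered out first.
    saved = await save_items(
        async_session=async_session,
        items=[
            {"id": 1, "type": "story", "time": 1160418111, "by": "pg",
             "title": "Y Combinator", "score": 57, "descendants": 15},
            None,  # a failed request is skipped, not an error
        ],
    )
    print(f"saved {len(saved)} item(s)")

    item = await get_item(async_session=async_session, item_id=1)
    print(item.title if item else "not found")


if __name__ == "__main__":
    asyncio.run(main())
```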