PP-Rec implementation #12

Open · wants to merge 21 commits into main
12 changes: 12 additions & 0 deletions README.md
@@ -136,6 +136,18 @@ Alternatively, configurations can be overridden from the command line, as follows:
python newsreclib/train.py experiment=nrms_mindsmall_plm_supconloss_bertsent data.batch_size=128
```

## Notes on model specifics

### MIND
Some models require the publication time of news articles. For the MIND dataset, this means building a dictionary that maps news article IDs to their publication times. To download the required file, run:

```
chmod +x mind_pbt_download.sh
./mind_pbt_download.sh
```

NOTE: The file `articles_timeDict_103630.pkl` is consumed by the `get_article2clicks` method, which generates the final file, `updated_articles_publish_time.pkl`.
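
For illustration, a minimal sketch of inspecting the downloaded dictionary (the path and the pickled layout, an ID-to-publish-time dict, are assumptions; the final conversion happens inside `get_article2clicks`):

```python
import pickle

# Assumed layout: {news article ID: publish time}.
with open("data/articles_timeDict_103630.pkl", "rb") as f:
    article2time = pickle.load(f)

print(len(article2time), "articles with publish times")
```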

## Features

- **Training**
3 changes: 2 additions & 1 deletion configs/callbacks/default.yaml
@@ -2,7 +2,8 @@ defaults:
- model_checkpoint.yaml
- early_stopping.yaml
- model_summary.yaml
- rich_progress_bar.yaml
# - rich_progress_bar.yaml
- tqdm_progress_bar.yaml # use tqdm when logging to a file
- _self_

model_checkpoint:
2 changes: 2 additions & 0 deletions configs/callbacks/rich_progress_bar.yaml
@@ -2,3 +2,5 @@

rich_progress_bar:
_target_: lightning.pytorch.callbacks.RichProgressBar
# TODO: evaluate if this could work https://github.com/Lightning-AI/pytorch-lightning/issues/2189#issuecomment-1913550357
# leave: True
2 changes: 2 additions & 0 deletions configs/callbacks/tqdm_progress_bar.yaml
@@ -0,0 +1,2 @@
tqdm_progress_bar:
_target_: lightning.pytorch.callbacks.TQDMProgressBar
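
For reference, a minimal sketch of what this callback config resolves to at runtime (the `refresh_rate` argument here is illustrative; the config above relies on the class defaults):

```python
from lightning.pytorch import Trainer
from lightning.pytorch.callbacks import TQDMProgressBar

# Equivalent of instantiating the callback config by hand.
trainer = Trainer(callbacks=[TQDMProgressBar(refresh_rate=1)])
```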
1 change: 1 addition & 0 deletions configs/data/adressa_news.yaml
@@ -33,6 +33,7 @@ id2index_filenames:
sentiment2index: "sentiment2index.tsv"
uid2index: "uid2index.tsv"
nid2index: "nid2index.tsv"
publishtime2index: "publishtime2index.tsv"

use_plm: False
use_pretrained_categ_embeddings: True
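The new `publishtime2index` entry follows the naming convention of the other id2index files; a minimal sketch of reading such a mapping (the two-column tab-separated layout is an assumption based on that convention):

```python
import csv

# Assumed layout per row: <publish time>\t<integer index>.
with open("publishtime2index.tsv", newline="") as f:
    publishtime2index = {
        row[0]: int(row[1]) for row in csv.reader(f, delimiter="\t")
    }
```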
2 changes: 2 additions & 0 deletions configs/data/adressa_rec.yaml
@@ -33,6 +33,7 @@ id2index_filenames:
sentiment2index: "sentiment2index.tsv"
uid2index: "uid2index.tsv"
nid2index: "nid2index.tsv"
publishtime2index: "publishtime2index.tsv"

use_plm: False
use_pretrained_categ_embeddings: True
@@ -64,3 +65,4 @@ batch_size: 64
num_workers: 0
pin_memory: True
drop_last: False
include_ctr: False
1 change: 1 addition & 0 deletions configs/data/mind_rec.yaml
@@ -66,3 +66,4 @@ batch_size: 64
num_workers: 0
pin_memory: True
drop_last: False
include_ctr: False
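The new `include_ctr` flag presumably feeds click-through-rate signals to PP-Rec's popularity predictor. A rough sketch of how per-article CTR can be derived from MIND-style impression labels (the `<news ID>-<0|1>` format follows the MIND behaviors file; the actual pipeline in the data module may differ):

```python
from collections import defaultdict

def compute_ctr(impressions):
    """Per-article CTR from impression labels such as ["N1-1", "N2-0"],
    where the suffix 1 means clicked and 0 means shown but not clicked."""
    clicks, shows = defaultdict(int), defaultdict(int)
    for impression in impressions:
        for item in impression:
            nid, label = item.rsplit("-", 1)
            shows[nid] += 1
            clicks[nid] += int(label)
    return {nid: clicks[nid] / shows[nid] for nid in shows}

print(compute_ctr([["N1-1", "N2-0"], ["N1-0", "N2-0", "N3-1"]]))
```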
50 changes: 50 additions & 0 deletions configs/experiment/pprec_adressaoneweek_plm_bprloss_bertsent.yaml
@@ -0,0 +1,50 @@
# @package _global_

# to execute this experiment run:
# python train.py experiment=pprec_adressaoneweek_plm_bprloss_bertsent

defaults:
- override /data: adressa_rec.yaml
- override /model: pprec.yaml
- override /callbacks: default.yaml
- override /logger: many_loggers.yaml
- override /trainer: gpu.yaml

# all parameters below will be merged with parameters from default configurations set above
# this allows you to overwrite only specified parameters

tags: ["pprec", "adressa_oneweek", "plm", "bprloss", "bertsent"]
seed: 42

data:
use_plm: True
tokenizer_name: roberta-base
tokenizer_use_fast: True
tokenizer_max_len: 96
batch_size: 8
include_ctr: True
num_workers: 63

model:
use_plm: True
attributes2encode: ["title"]
use_entities: False
plm_model: "NbAiLab/nb-bert-base"
frozen_layers: [0, 1, 2, 3, 4, 5, 6, 7]
text_embed_dim: 768
text_num_heads: 16
pretrained_entity_embeddings_path: ${paths.data_dir}Adressa_oneweek/transformed_entity_embeddings.npy
loss: "BPR_pairwise_loss"

callbacks:
early_stopping:
patience: 3

trainer:
max_epochs: 10

logger:
wandb:
name: "pprec_adressa_oneweek_plm_bprloss_bertsent_s42"
tags: ${tags}
group: "adressa"
51 changes: 51 additions & 0 deletions configs/experiment/pprec_mindsmall_plm_bprloss_bertsent.yaml
@@ -0,0 +1,51 @@
# @package _global_

# to execute this experiment run:
# python train.py experiment=pprec_mindsmall_plm_bprloss_bertsent

defaults:
- override /data: mind_rec_bert_sent.yaml
- override /model: pprec.yaml
- override /callbacks: default.yaml
- override /logger: many_loggers.yaml
- override /trainer: gpu.yaml

# all parameters below will be merged with parameters from default configurations set above
# this allows you to overwrite only specified parameters

tags: ["pprec", "mindsmall", "plm", "bprloss", "bertsent"]

seed: 42

data:
dataset_size: "small"
use_plm: True
tokenizer_name: roberta-base
tokenizer_use_fast: True
tokenizer_max_len: 96
batch_size: 8
include_ctr: True
num_workers: 63

model:
use_plm: True
plm_model: "roberta-base"
frozen_layers: [0, 1, 2, 3, 4, 5, 6, 7]
text_embed_dim: 768
text_num_heads: 16
query_dim: 200
pretrained_entity_embeddings_path: ${paths.data_dir}MINDsmall_train/transformed_entity_embeddings.npy
loss: "BPR_pairwise_loss"

callbacks:
early_stopping:
patience: 3

trainer:
max_epochs: 10

logger:
wandb:
name: "pprec_mindsmall_plm_bprloss_bertsent_s42"
tags: ${tags}
group: "mind"
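
Both BPR experiments set `loss: "BPR_pairwise_loss"`. A minimal sketch of what a BPR pairwise loss computes (the repo's implementation may differ in shapes and reduction):

```python
import torch
import torch.nn.functional as F

def bpr_pairwise_loss(pos_scores, neg_scores):
    # BPR: maximize sigmoid(s_pos - s_neg), i.e. minimize -log sigmoid(diff).
    return -F.logsigmoid(pos_scores - neg_scores).mean()

pos = torch.tensor([2.0, 0.5])
neg = torch.tensor([1.0, 1.5])
print(bpr_pairwise_loss(pos, neg))  # approx. 0.8133
```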
51 changes: 51 additions & 0 deletions configs/experiment/pprec_mindsmall_plm_celoss_bertsent.yaml
@@ -0,0 +1,51 @@
# @package _global_

# to execute this experiment run:
# python train.py experiment=pprec_mindsmall_plm_celoss_bertsent

defaults:
- override /data: mind_rec_bert_sent.yaml
- override /model: pprec.yaml
- override /callbacks: default.yaml
- override /logger: many_loggers.yaml
- override /trainer: gpu.yaml

# all parameters below will be merged with parameters from default configurations set above
# this allows you to overwrite only specified parameters

tags: ["pprec", "mindsmall", "plm", "celoss", "bertsent"]

seed: 42

data:
dataset_size: "small"
use_plm: True
tokenizer_name: roberta-base
tokenizer_use_fast: True
tokenizer_max_len: 96
batch_size: 8
include_ctr: True
num_workers: 18

model:
use_plm: True
plm_model: "roberta-base"
frozen_layers: [0, 1, 2, 3, 4, 5, 6, 7]
text_embed_dim: 768
text_num_heads: 16
query_dim: 200
pop_num_embeddings: 1500 # < 1100 causes an error
pretrained_entity_embeddings_path: ${paths.data_dir}MINDsmall_train/transformed_entity_embeddings.npy

callbacks:
early_stopping:
patience: 3

trainer:
max_epochs: 10

logger:
wandb:
name: "pprec_mindsmall_plm_celoss_bertsent_s42"
tags: ${tags}
group: "mind"
49 changes: 49 additions & 0 deletions configs/experiment/pprec_mindsmall_pretrainedemb_celoss_bertsent.yaml
@@ -0,0 +1,49 @@
# @package _global_

# to execute this experiment run:
# python train.py experiment=pprec_mindsmall_pretrainedemb_celoss_bertsent

defaults:
- override /data: mind_rec_bert_sent.yaml
- override /model: pprec.yaml
- override /callbacks: default.yaml
- override /logger: many_loggers.yaml
- override /trainer: gpu.yaml

# all parameters below will be merged with parameters from default configurations set above
# this allows you to overwrite only specified parameters

tags: ["pprec", "mindsmall", "pretrainedemb", "celoss", "bertsent"]

seed: 42

data:
dataset_size: "small"
use_plm: True
tokenizer_name: roberta-base
tokenizer_use_fast: True
tokenizer_max_len: 96
batch_size: 8
include_ctr: True

model:
use_plm: False
embed_dim: 300
num_heads: 15
query_dim: 200
dropout_probability: 0.2
pretrained_entity_embeddings_path: ${paths.data_dir}MINDsmall_train/transformed_entity_embeddings.npy
pretrained_embeddings_path: ${paths.data_dir}MINDsmall_train/transformed_word_embeddings.npy

callbacks:
early_stopping:
patience: 5

trainer:
max_epochs: 20

logger:
wandb:
name: "pprec_mindsmall_pretrainedemb_celoss_bertsent_s42"
tags: ${tags}
group: "mind"
2 changes: 1 addition & 1 deletion configs/logger/many_loggers.yaml
@@ -2,5 +2,5 @@

defaults:
- csv.yaml
# - tensorboard.yaml
- wandb.yaml
64 changes: 64 additions & 0 deletions configs/model/pprec.yaml
@@ -0,0 +1,64 @@
_target_: newsreclib.models.fair_rec.pprec_module.PPRECModule

# training strategy
dual_loss_training: False
dual_loss_coef: null
loss: "BPR_pairwise_loss"
temperature: null

# news encoder
dataset_attributes: ${data.dataset_attributes}
attributes2encode: ["title", "title_entities"]
use_plm: False
use_entities: True

# text encoder
# parameters taken from the reference implementation (https://github.com/taoqi98/PP-Rec) and the PP-Rec paper
pretrained_embeddings_path: null
plm_model: null
frozen_layers: null
query_dim: 200
dropout_probability: 0.2
pop_num_embeddings: 500
pop_embedding_dim: 400
hidden_dim_pop_predictor: 100
rec_num_emb_pop_predictor: 1500
rec_emb_dim_pop_predictor: 100
text_embed_dim: 300
text_num_heads: 20
entity_embed_dim: 100
entity_num_heads: 20
categ_embed_dim: 100
cpja_hidden_dim: 100

# outputs
outputs:
train: ["preds", "targets", "cand_news_size"]
val: ["preds", "targets", "cand_news_size"]
test:
[
"preds",
"targets",
"cand_news_size",
"hist_news_size",
"target_categories",
"target_sentiments",
"hist_categories",
"hist_sentiments",
"user_ids",
"cand_news_ids",
]

# evaluation
top_k_list: [5, 10]
num_categ_classes: 18
num_sent_classes: 3
save_recs: False
recs_fpath: "${paths.output_dir}/recommendations.json"

optimizer:
_target_: torch.optim.Adam
_partial_: true
lr: 0.0001

scheduler: null
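
As a usage note, a minimal sketch of how the `_partial_: true` optimizer block resolves through Hydra (the wiring below is illustrative; newsreclib binds the model parameters internally):

```python
import hydra
import torch
from omegaconf import OmegaConf

cfg = OmegaConf.create(
    {"_target_": "torch.optim.Adam", "_partial_": True, "lr": 0.0001}
)
# instantiate() returns functools.partial(Adam, lr=0.0001); the Lightning
# module later supplies the model parameters.
optimizer_factory = hydra.utils.instantiate(cfg)
optimizer = optimizer_factory(params=torch.nn.Linear(10, 1).parameters())
```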
1 change: 1 addition & 0 deletions configs/train.yaml
@@ -46,6 +46,7 @@ test: True
compile: False # Torchmetrics does not yet support pytorch 2.0 compile function because of dynamo lack of support for 'setattr'

# simply provide checkpoint path to resume training
# ckpt_path: logs/train/runs/nrms_mindsmall_pretrainedemb_celoss_bertsent_s42/2024-03-08_07-48-40/checkpoints/epoch_016.ckpt
ckpt_path: null

# seed for random number generators in pytorch, numpy and python.random
28 changes: 28 additions & 0 deletions mind_pbt_download.sh
@@ -0,0 +1,28 @@
#!/bin/bash

# URL of the file to download
URL="https://github.com/summmeer/session-based-news-recommendation/files/9467559/articles_timeDict_103630.pkl.gz"
# Name of the file to save after download
FILENAME="articles_timeDict_103630.pkl.gz"
# Destination directory
DEST_DIR="data"

# Download the file using curl (-f fails on HTTP errors so no bogus file is left behind)
curl -fL "$URL" -o "$FILENAME"

# Check if the download was successful
if [ -f "$FILENAME" ]; then
    # Unzip the file
    gunzip -f "$FILENAME"
    echo "File downloaded and unzipped successfully."
    # Create the data directory if it doesn't exist
    if [ ! -d "$DEST_DIR" ]; then
        mkdir "$DEST_DIR"
    fi
    # Move the unzipped file to the data directory
    mv "${FILENAME%.gz}" "$DEST_DIR/"
    echo "File moved to $DEST_DIR"
else
    echo "Failed to download the file."
fi