# load_data.py
import pickle
from sklearn.datasets import fetch_openml
from sklearn.utils import shuffle
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import normalize
import numpy as np
import pandas as pd
import os
import random
from utlis import load_pkl
import torch
import torchvision
from torchvision import datasets, transforms
from sklearn.cluster import KMeans
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder
import scipy.cluster.hierarchy as hcluster
import time


class twitter_IM:
    def __init__(self, Num_Seeds=5, Budget=100, Num_Runs=5, Num_cluster=50):
        os.chdir('/Users/yuting/PycharmProjects/bandit/IM/data_processing/')
        # /home/svu/yt.f/data/
        self.num_user = Num_cluster  # number of clusters
        self.tweets = pd.read_csv("6_date_sorted_influencer_10context_data_" + str(Num_cluster) + "clustered.csv",
                                  delimiter=";")
        self.mapping_dict = load_pkl("mapping_" + str(Num_cluster) + ".pkl")
        with open('influencer_embedding.pkl', 'rb') as f_emb:
            influencer_emb = pickle.load(f_emb)
        self.INFLUENCERS = [int(i) for i in list(influencer_emb.keys())]
        self.influencer_emb = np.array(list(influencer_emb.values()))
        self.n_arm = len(self.INFLUENCERS)  # number of influencers -> number of arms to be selected
        self.MAX_NA = float(self.tweets.new_activations.max())  # maximum number of newly activated nodes
        self.MAX_LOG_NA = np.log(self.tweets.new_activations.max())
        # parse the string representation of each node set back into a Python set
        self.tweets.regular_node_set_unique = self.tweets.regular_node_set_unique.apply(
            lambda txt: eval(txt))
        self.tweets.regular_node_set_grouped = self.tweets.regular_node_set_grouped.apply(
            lambda txt: eval(txt))
        # all the contexts
        self.twitter_contexts = self.tweets.context
        # needed as vectors for the regressions
        self.twitter_contexts = list(map(lambda context: np.array(context.split(), dtype=float),
                                         self.twitter_contexts))  # list of context arrays
        self.dim = 2 * len(self.twitter_contexts[0])
        np.random.seed(100)
        # draw Num_Seeds distinct random seeds, one per repetition of the experiment
        self.seeds = dict.fromkeys(list(set([np.random.randint(1000) for _ in np.arange(Num_Seeds + 10)]))[:Num_Seeds])
        for seed in self.seeds.keys():
            np.random.seed(seed)
            # select the contexts for the running campaigns
            context_idx = list(set([np.random.randint(0, len(self.twitter_contexts))
                                    for _ in np.arange(Budget * Num_Runs + 100)]))[:Budget * Num_Runs]
            self.seeds[seed] = [self.twitter_contexts[idx] for idx in context_idx]

    def generate(self, seed):
        """Return the pre-sampled contexts for `seed` and the matching campaign rows."""
        campaign = []
        contexts = self.seeds[seed]
        for context in contexts:
            campaign_temp = self.tweets.loc[self.tweets.context == ' '.join(context[:self.dim + 1].astype(str)), :]
            # if campaign_temp.size > 0:
            #     campaign_temp = campaign_temp.sample()
            campaign.append(campaign_temp)
        return contexts, campaign


class weibo_IM:
    def __init__(self, Num_Seeds=5, Budget=100, Num_Runs=5, Num_cluster=50):
        os.chdir('/Users/yuting/PycharmProjects/bandit/IM/data_processing/')
        # /home/svu/yt.f/data/
        self.num_user = Num_cluster  # number of clusters
        self.tweets = pd.read_csv("weibo_" + str(Num_cluster) + "clustered.csv", delimiter=";")
        self.mapping_dict = load_pkl("mapping_" + str(Num_cluster) + "_weibo.pkl")
        with open('influencer_embedding_weibo.pkl', 'rb') as f_emb:
            influencer_emb = pickle.load(f_emb)
        self.INFLUENCERS = [int(i) for i in list(influencer_emb.keys())]
        self.influencer_emb = np.array(list(influencer_emb.values()))
        self.n_arm = len(self.INFLUENCERS)  # number of influencers -> number of arms to be selected
        self.MAX_NA = float(self.tweets.new_activations.max())  # maximum number of newly activated nodes
        self.MAX_LOG_NA = np.log(self.tweets.new_activations.max())
        # parse the string representation of each node set back into a Python set
        self.tweets.regular_node_set_unique = self.tweets.regular_node_set_unique.apply(
            lambda txt: eval(txt))
        self.tweets.regular_node_set_grouped = self.tweets.regular_node_set_grouped.apply(
            lambda txt: eval(txt))
        # all the contexts
        self.twitter_contexts = self.tweets.context
        # needed as vectors for the regressions
        self.twitter_contexts = list(map(lambda context: np.array(context.split(), dtype=float),
                                         self.twitter_contexts))  # list of context arrays
        self.dim = 2 * len(self.twitter_contexts[0])
        np.random.seed(100)
        # draw Num_Seeds distinct random seeds, one per repetition of the experiment
        self.seeds = dict.fromkeys(list(set([np.random.randint(1000) for _ in np.arange(Num_Seeds + 10)]))[:Num_Seeds])
        for seed in self.seeds.keys():
            np.random.seed(seed)
            # select the contexts for the running campaigns
            context_idx = list(set([np.random.randint(0, len(self.twitter_contexts))
                                    for _ in np.arange(Budget * Num_Runs + 100)]))[:Budget * Num_Runs]
            self.seeds[seed] = [self.twitter_contexts[idx] for idx in context_idx]

    def generate(self, seed):
        """Return the pre-sampled contexts for `seed` and the matching campaign rows."""
        campaign = []
        contexts = self.seeds[seed]
        for context in contexts:
            campaign_temp = self.tweets.loc[self.tweets.context == ' '.join(context[:self.dim + 1].astype(str)), :]
            # if campaign_temp.size > 0:
            #     campaign_temp = campaign_temp.sample()
            campaign.append(campaign_temp)
        return contexts, campaign
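

# Example usage (a minimal sketch, not part of the original loader): it assumes the
# clustered CSV, mapping pickle, and influencer embedding pickle referenced above
# exist in the directory set by os.chdir. It builds the Twitter environment, picks
# one of the pre-drawn random seeds, and fetches the sampled contexts and matching
# campaign rows for that seed.
if __name__ == "__main__":
    env = twitter_IM(Num_Seeds=5, Budget=100, Num_Runs=5, Num_cluster=50)
    first_seed = next(iter(env.seeds))  # any of the Num_Seeds pre-drawn seeds
    contexts, campaign = env.generate(first_seed)
    print("number of arms (influencers):", env.n_arm)
    print("context dimension (2 * raw context length):", env.dim)
    print("campaign rounds sampled for this seed:", len(campaign))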