Skip to content
This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

Add contrib.rand_zipfian #9747

Merged
merged 18 commits into from
Feb 23, 2018
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 73 additions & 1 deletion python/mxnet/ndarray/contrib.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,81 @@
# coding: utf-8
# pylint: disable=wildcard-import, unused-wildcard-import
"""Contrib NDArray API of MXNet."""
import math
from ..context import current_context
from ..random import uniform
try:
from .gen_contrib import *
except ImportError:
pass

__all__ = []
__all__ = ["rand_zipfian"]

# pylint: disable=line-too-long
def rand_zipfian(true_classes, num_sampled, range_max, ctx=None):
"""Draw random samples from an approximately log-uniform or Zipfian distribution.

This operation randomly samples *num_sampled* candidates the range of integers [0, range_max).
The elements of sampled_candidates are drawn with replacement from the base distribution.

The base distribution for this operator is an approximately log-uniform or Zipfian distribution:

P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)

This sampler is useful when the true classes approximately follow such a distribution.
For example, if the classes represent words in a lexicon sorted in decreasing order of \
frequency. If your classes are not ordered by decreasing frequency, do not use this op.

Additionaly, it also returns the number of times each of the \
true classes and the sampled classes is expected to occur.

Parameters
----------
true_classes : NDArray
A 1-D NDArray of the target classes.
num_sampled: int
The number of classes to randomly sample.
range_max: int
The number of possible classes.
ctx : Context
Device context of output. Default is current context. Overridden by
`mu.context` when `mu` is an NDArray.

Returns
-------
samples: NDArray
The sampled candidate classes in 1-D `int64` dtype.
expected_count_true: NDArray
The expected count for true classes in 1-D `float64` dtype.
expected_count_sample: NDArray
The expected count for sampled candidates in 1-D `float64` dtype.

Examples
--------
>>> true_cls = mx.nd.array([3])
>>> samples, exp_count_true, exp_count_sample = mx.nd.contrib.rand_zipfian(true_cls, 4, 5)
>>> samples
[1 3 3 3]
<NDArray 4 @cpu(0)>
>>> exp_count_true
[ 0.12453879]
<NDArray 1 @cpu(0)>
>>> exp_count_sample
[ 0.22629439 0.12453879 0.12453879 0.12453879]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The example output looks suspicious as it does not sum up to 1.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry I've misunderstood the term. It should be correct.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I feel it's suspicious at first glance because the exp_count of 1 is larger than the exp_count of 3. However, the sampling result show that 3 is much more often then 1. We need to sample multiple times and test if the empirical expectation matches the true expectation.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's just a coincident for the first 5 samples. If I sample 50 times, it returns:


1
3
3
3
2
0
0
0
0
1
3
1
1
3
0
2
0
4
0
3
1
3
1
2
2
1
1
2
0
1
0
2
0
0
0
0
0
0
4
1
1
4
0
4
2
0
0
2
1
0

0's = 19

1's = 12

2's = 8

3's = 7

4's = 4

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, looks good

<NDArray 4 @cpu(0)>
"""
if ctx is None:
ctx = current_context()
log_range = math.log(range_max + 1)
rand = uniform(0, log_range, shape=(num_sampled,), dtype='float64', ctx=ctx)
# make sure sampled_classes are in the range of [0, range_max)
sampled_classes = (rand.exp() - 1).astype('int64') % range_max

true_cls = true_classes.as_in_context(ctx).astype('float64')
expected_count_true = ((true_cls + 2.0) / (true_cls + 1.0)).log() / log_range * num_sampled
# cast sampled classes to fp64 to avoid interget division
sampled_cls_fp64 = sampled_classes.astype('float64')
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why is the output always float64?

Copy link
Member Author

@eric-haibin-lin eric-haibin-lin Feb 11, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

64-bit is adopted because this sampler is usually used for extremely large number of classes. Returned samples are always actually always in int64. The fp64 here is used to calculate the probability of a particular classes. (Limited precision of fp32 treat 50M - 1 and 50M - 2 as the same number, yielding nan when taking the log)

expected_prob_sampled = ((sampled_cls_fp64 + 2.0) / (sampled_cls_fp64 + 1.0)).log() / log_range
expected_count_sampled = expected_prob_sampled * num_sampled
return sampled_classes, expected_count_true, expected_count_sampled
# pylint: enable=line-too-long
69 changes: 68 additions & 1 deletion python/mxnet/symbol/contrib.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,76 @@
# coding: utf-8
# pylint: disable=wildcard-import, unused-wildcard-import
"""Contrib Symbol API of MXNet."""
import math
from .random import uniform
from .symbol import Symbol
try:
from .gen_contrib import *
except ImportError:
pass

__all__ = []
__all__ = ["rand_zipfian"]

def rand_zipfian(true_classes, num_sampled, range_max):
"""Draw random samples from an approximately log-uniform or Zipfian distribution.

This operation randomly samples *num_sampled* candidates the range of integers [0, range_max).
The elements of sampled_candidates are drawn with replacement from the base distribution.

The base distribution for this operator is an approximately log-uniform or Zipfian distribution:

P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)

This sampler is useful when the true classes approximately follow such a distribution.
For example, if the classes represent words in a lexicon sorted in decreasing order of \
frequency. If your classes are not ordered by decreasing frequency, do not use this op.

Additionaly, it also returns the number of times each of the \
true classes and the sampled classes is expected to occur.

Parameters
----------
true_classes : Symbol
The target classes in 1-D.
num_sampled: int
The number of classes to randomly sample.
range_max: int
The number of possible classes.

Returns
-------
samples: Symbol
The sampled candidate classes in 1-D `int64` dtype.
expected_count_true: Symbol
The expected count for true classes in 1-D `float64` dtype.
expected_count_sample: Symbol
The expected count for sampled candidates in 1-D `float64` dtype.

Examples
--------
>>> true_cls = mx.nd.array([3])
>>> samples, exp_count_true, exp_count_sample = mx.nd.contrib.rand_zipfian(true_cls, 4, 5)
>>> samples
[1 3 3 3]
<NDArray 4 @cpu(0)>
>>> exp_count_true
[ 0.12453879]
<NDArray 1 @cpu(0)>
>>> exp_count_sample
[ 0.22629439 0.12453879 0.12453879 0.12453879]
<NDArray 4 @cpu(0)>
"""
assert(isinstance(true_classes, Symbol)), "unexpected type %s" % type(true_classes)
log_range = math.log(range_max + 1)
rand = uniform(0, log_range, shape=(num_sampled,), dtype='float64')
# make sure sampled_classes are in the range of [0, range_max)
sampled_classes = (rand.exp() - 1).astype('int64') % range_max

true_classes = true_classes.astype('float64')
expected_prob_true = ((true_classes + 2.0) / (true_classes + 1.0)).log() / log_range
expected_count_true = expected_prob_true * num_sampled
# cast sampled classes to fp64 to avoid interget division
sampled_cls_fp64 = sampled_classes.astype('float64')
expected_prob_sampled = ((sampled_cls_fp64 + 2.0) / (sampled_cls_fp64 + 1.0)).log() / log_range
expected_count_sampled = expected_prob_sampled * num_sampled
return [sampled_classes, expected_count_true, expected_count_sampled]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why a list?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch, I forgot to update this

30 changes: 30 additions & 0 deletions tests/python/unittest/test_random.py
Original file line number Diff line number Diff line change
Expand Up @@ -521,6 +521,36 @@ def check_data(a, b):
for j in range(i+1, num_seeds):
check_data(data[i],data[j])

@with_seed()
def test_zipfian_generator():
# dummy true classes
num_true = 5
num_sampled = 1000
range_max = 20

def compute_expected_prob():
# P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)
classes = mx.nd.arange(0, range_max)
expected_counts = ((classes + 2).log() - (classes + 1).log()) / np.log(range_max + 1)
return expected_counts

exp_cnt = compute_expected_prob() * num_sampled

# test ndarray
true_classes = mx.nd.random.uniform(0, range_max, shape=(num_true,)).astype('int32')
sampled_classes, exp_cnt_true, exp_cnt_sampled = mx.nd.contrib.rand_zipfian(true_classes, num_sampled, range_max)
mx.test_utils.assert_almost_equal(exp_cnt_sampled.asnumpy(), exp_cnt[sampled_classes].asnumpy(), rtol=1e-1, atol=1e-2)
mx.test_utils.assert_almost_equal(exp_cnt_true.asnumpy(), exp_cnt[true_classes].asnumpy(), rtol=1e-1, atol=1e-2)

# test symbol
true_classes_var = mx.sym.var('true_classes')
outputs = mx.sym.contrib.rand_zipfian(true_classes_var, num_sampled, range_max)
outputs = mx.sym.Group(outputs)
executor = outputs.bind(mx.context.current_context(), {'true_classes' : true_classes})
executor.forward()
sampled_classes, exp_cnt_true, exp_cnt_sampled = executor.outputs
mx.test_utils.assert_almost_equal(exp_cnt_sampled.asnumpy(), exp_cnt[sampled_classes].asnumpy(), rtol=1e-1, atol=1e-2)
mx.test_utils.assert_almost_equal(exp_cnt_true.asnumpy(), exp_cnt[true_classes].asnumpy(), rtol=1e-1, atol=1e-2)

if __name__ == '__main__':
import nose
Expand Down