Skip to content
This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

Add contrib.rand_zipfian #9747

Merged
merged 18 commits into from
Feb 23, 2018
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 73 additions & 0 deletions python/mxnet/ndarray/contrib.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,82 @@
# coding: utf-8
# pylint: disable=wildcard-import, unused-wildcard-import
"""Contrib NDArray API of MXNet."""
import math
from ..context import current_context
from ..random import uniform
try:
    from .gen_contrib import *
except ImportError:
    pass

# Explicitly export the hand-written sampler so `from mxnet.ndarray.contrib
# import *` picks it up alongside the generated operators.
__all__ = ["rand_log_uniform"]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

need to include it in all


def rand_log_uniform(true_classes, num_sampled, range_max, ctx=None):
    """Draw random samples from an approximately log-uniform or Zipfian distribution.

    This operation randomly samples *num_sampled* candidates in the range of
    integers [0, range_max). The elements of sampled_candidates are drawn with
    replacement from the base distribution.

    The base distribution for this operation is an approximately log-uniform
    or Zipfian distribution:

        P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)

    This sampler is useful when the true classes approximately follow such a
    distribution. For example, if the classes represent words in a lexicon
    sorted in decreasing order of frequency. If your classes are not ordered
    by decreasing frequency, do not use this op.

    Additionally, it also returns the probability with which each of the
    true classes and the sampled classes occurs in a single draw.

    Parameters
    ----------
    true_classes : NDArray
        A 1-D NDArray of the target classes.
    num_sampled : int
        The number of classes to randomly sample.
    range_max : int
        The number of possible classes.
    ctx : Context
        Device context of output. Default is current context.

    Returns
    -------
    samples : NDArray
        A 1-D `int64` NDArray of sampled candidate classes.
    prob_true : NDArray
        A 1-D `float64` NDArray of the per-draw sampling probability of each
        true class.
    prob_sample : NDArray
        A 1-D `float64` NDArray of the per-draw sampling probability of each
        sampled class.

    Examples
    --------
    >>> true_cls = mx.nd.array([3])
    >>> samples, prob_true, prob_sample = \\
    ...     mx.nd.contrib.rand_log_uniform(true_cls, 4, 5)
    """
    if ctx is None:
        ctx = current_context()
    log_range = math.log(range_max + 1)
    # Inverse-CDF sampling: for u ~ Uniform(0, log(range_max + 1)),
    # floor(exp(u) - 1) follows the Zipfian distribution above.
    rand = uniform(0, log_range, shape=(num_sampled,), dtype='float64', ctx=ctx)
    # make sure sampled_classes are in the range of [0, range_max)
    sampled_classes = (rand.exp() - 1).astype('int64') % range_max

    true_classes = true_classes.as_in_context(ctx).astype('float64')
    # Per-draw probability of each true class.
    # NOTE(review): to obtain the *expected count* over all draws, this would
    # need to be multiplied by num_sampled — confirm intended semantics.
    expected_count_true = ((true_classes + 2.0) / (true_classes + 1.0)).log() / log_range
    # cast sampled classes to fp64 to avoid integer division; fp64 (not fp32)
    # is required because for very large range_max adjacent classes are not
    # distinguishable in fp32, which would make the log yield nan
    sampled_cls_fp64 = sampled_classes.astype('float64')
    expected_count_sampled = ((sampled_cls_fp64 + 2.0) / (sampled_cls_fp64 + 1.0)).log() / log_range
    # Return a tuple (not a list): the three outputs are a fixed-arity result,
    # and unpacking/indexing by existing callers is unaffected.
    return sampled_classes, expected_count_true, expected_count_sampled
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No need to return a list here.

1 change: 1 addition & 0 deletions python/mxnet/ndarray/random.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

"""Random distribution generator NDArray API of MXNet."""

import math
from ..base import numeric_types, _Null
from ..context import current_context
from . import _internal
Expand Down
75 changes: 75 additions & 0 deletions python/mxnet/symbol/contrib.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,84 @@
# coding: utf-8
# pylint: disable=wildcard-import, unused-wildcard-import
"""Contrib Symbol API of MXNet."""
import math
from ..context import current_context
from .random import uniform
from .symbol import Symbol
try:
    from .gen_contrib import *
except ImportError:
    pass

# Explicitly export the hand-written sampler so `from mxnet.symbol.contrib
# import *` picks it up alongside the generated operators.
__all__ = ["rand_log_uniform"]

def rand_log_uniform(true_classes, num_sampled, range_max, ctx=None):
    """Draw random samples from an approximately log-uniform or Zipfian distribution.

    This operation randomly samples *num_sampled* candidates in the range of
    integers [0, range_max). The elements of sampled_candidates are drawn with
    replacement from the base distribution.

    The base distribution for this operation is an approximately log-uniform
    or Zipfian distribution:

        P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)

    This sampler is useful when the true classes approximately follow such a
    distribution. For example, if the classes represent words in a lexicon
    sorted in decreasing order of frequency. If your classes are not ordered
    by decreasing frequency, do not use this op.

    Additionally, it also returns the probability with which each of the
    true classes and the sampled classes occurs in a single draw.

    Parameters
    ----------
    true_classes : Symbol
        The target classes in 1-D.
    num_sampled : int
        The number of classes to randomly sample.
    range_max : int
        The number of possible classes.
    ctx : Context
        Ignored. Symbols carry no device context; the parameter is kept only
        for API symmetry with the NDArray version.

    Returns
    -------
    samples : Symbol
        A 1-D `int64` Symbol of sampled candidate classes.
    prob_true : Symbol
        A 1-D `float64` Symbol of the per-draw sampling probability of each
        true class.
    prob_sample : Symbol
        A 1-D `float64` Symbol of the per-draw sampling probability of each
        sampled class.

    Examples
    --------
    >>> true_cls = mx.sym.var('true_classes')
    >>> samples, prob_true, prob_sample = \\
    ...     mx.sym.contrib.rand_log_uniform(true_cls, 4, 5)
    """
    assert isinstance(true_classes, Symbol), "unexpected type %s" % type(true_classes)
    log_range = math.log(range_max + 1)
    # Inverse-CDF sampling: for u ~ Uniform(0, log(range_max + 1)),
    # floor(exp(u) - 1) follows the Zipfian distribution above.
    # Note: `ctx` is deliberately NOT forwarded here — the symbolic uniform op
    # takes no context; the device is chosen when the symbol is bound.
    rand = uniform(0, log_range, shape=(num_sampled,), dtype='float64')
    # make sure sampled_classes are in the range of [0, range_max)
    sampled_classes = (rand.exp() - 1).astype('int64') % range_max

    true_classes = true_classes.astype('float64')
    # Per-draw probability of each true class.
    # NOTE(review): to obtain the *expected count* over all draws, this would
    # need to be multiplied by num_sampled — confirm intended semantics.
    expected_count_true = ((true_classes + 2.0) / (true_classes + 1.0)).log() / log_range
    # cast sampled classes to fp64 to avoid integer division; fp64 (not fp32)
    # is required because for very large range_max adjacent classes are not
    # distinguishable in fp32, which would make the log yield nan
    sampled_cls_fp64 = sampled_classes.astype('float64')
    expected_count_sampled = ((sampled_cls_fp64 + 2.0) / (sampled_cls_fp64 + 1.0)).log() / log_range
    # Return a tuple (not a list): the three outputs are a fixed-arity result,
    # and unpacking/indexing by existing callers is unaffected.
    return sampled_classes, expected_count_true, expected_count_sampled
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why a list?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch, I forgot to update this

1 change: 1 addition & 0 deletions src/operator/tensor/elemwise_unary_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -357,6 +357,7 @@ struct CastParam : public dmlc::Parameter<CastParam> {
.add_enum("float16", mshadow::kFloat16)
.add_enum("uint8", mshadow::kUint8)
.add_enum("int32", mshadow::kInt32)
.add_enum("int64", mshadow::kInt64)
.describe("Output data type.");
}
};
Expand Down
30 changes: 30 additions & 0 deletions tests/python/unittest/test_random.py
Original file line number Diff line number Diff line change
Expand Up @@ -412,6 +412,36 @@ def test_multinomial_generator():
for _ in range(10)])
verify_generator(generator=generator_mx_same_seed, buckets=buckets, probs=probs)

def test_log_uniform_generator():
    """Check that rand_log_uniform's reported probabilities match the
    analytic Zipfian distribution, through both the NDArray and Symbol APIs.
    """
    # Fixed seed: the sampled classes below depend on the RNG draw order.
    mx.random.seed(0)
    # dummy true classes
    num_true = 5
    num_sampled = 1000
    range_max = 20

    def compute_expected_counts():
        # P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)
        classes = mx.nd.arange(0, range_max)
        expected_counts = ((classes + 2).log() - (classes + 1).log()) / np.log(range_max + 1)
        return expected_counts

    # Analytic per-class probability table, indexed by class id.
    exp_cnt = compute_expected_counts()

    # test ndarray
    true_classes = mx.nd.random.uniform(0, range_max, shape=(num_true,)).astype('int32')
    sampled_classes, exp_cnt_true, exp_cnt_sampled = mx.nd.contrib.rand_log_uniform(true_classes, num_sampled, range_max)
    # Returned probabilities must agree with the analytic table at the
    # corresponding class indices (loose tolerance: fp64 vs fp32 table math).
    mx.test_utils.assert_almost_equal(exp_cnt_sampled.asnumpy(), exp_cnt[sampled_classes].asnumpy(), rtol=1e-1, atol=1e-2)
    mx.test_utils.assert_almost_equal(exp_cnt_true.asnumpy(), exp_cnt[true_classes].asnumpy(), rtol=1e-1, atol=1e-2)

    # test symbol
    true_classes_var = mx.sym.var('true_classes')
    outputs = mx.sym.contrib.rand_log_uniform(true_classes_var, num_sampled, range_max)
    # Group the three outputs so a single executor produces all of them.
    outputs = mx.sym.Group(outputs)
    executor = outputs.bind(mx.context.current_context(), {'true_classes' : true_classes})
    executor.forward()
    sampled_classes, exp_cnt_true, exp_cnt_sampled = executor.outputs
    mx.test_utils.assert_almost_equal(exp_cnt_sampled.asnumpy(), exp_cnt[sampled_classes].asnumpy(), rtol=1e-1, atol=1e-2)
    mx.test_utils.assert_almost_equal(exp_cnt_true.asnumpy(), exp_cnt[true_classes].asnumpy(), rtol=1e-1, atol=1e-2)

if __name__ == '__main__':
import nose
Expand Down