Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Two and tree #1

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
v0.2.0, 19.01.2015 -- Py2 and Py3 compatible
v0.1.1, 19.04.2013 -- Data file.
v0.1.0, 17.04.2013 -- Initial release.
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,5 @@
license='GPLv3',
description='Get the gender from first name.',
long_description=open('README.rst').read(),
requires=['six (>=1.9)']
)
1 change: 1 addition & 0 deletions sexmachine/data/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
__author__ = 'repsdorph'
36 changes: 36 additions & 0 deletions sexmachine/data/test_detector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# -*- coding: utf-8 -*-
import unittest
import sexmachine.detector as d


class TestDetector(unittest.TestCase):
    """Exercise ``Detector.get_gender`` with and without case sensitivity."""

    def setUp(self):
        # Default construction first, then the case-insensitive variant.
        self.case = d.Detector()
        self.incase = d.Detector(case_sensitive=False)

    def _expect(self, detector, cases):
        """Assert ``detector.get_gender(*args) == wanted`` for every pair."""
        for args, wanted in cases:
            self.assertEqual(detector.get_gender(*args), wanted)

    def test_gender(self):
        """Plain ASCII names, including one expected to come back as "andy"."""
        self._expect(self.case, [
            ((u"Bob",), "male"),
            ((u"Sally",), "female"),
            ((u"Pauley",), "andy"),
        ])

    def test_unicode(self):
        """Names containing non-ASCII characters."""
        self._expect(self.case, [
            ((u"Álfrún",), "female"),
            ((u"Ayşe",), "female"),
            ((u"Gavriliţă",), "female"),
            ((u"İsmet",), "male"),
            ((u"Snæbjörn",), "male"),
        ])

    def test_country(self):
        """The same name queried without and with a country argument."""
        self._expect(self.case, [
            ((u"Jamie",), "mostly_female"),
            ((u"Jamie", "great_britain"), "mostly_male"),
        ])

    def test_case(self):
        """Lower- and title-case spellings against the case-insensitive detector."""
        self._expect(self.incase, [
            ((u"sally",), "female"),
            ((u"Sally",), "female"),
            ((u"aydın",), "male"),
            ((u"Aydın",), "male"),
        ])

if __name__ == '__main__':
unittest.main()
File renamed without changes.
122 changes: 89 additions & 33 deletions sexmachine/detector.py
Original file line number Diff line number Diff line change
@@ -1,46 +1,68 @@
"""Module for getting a gender based on name and optional country."""

import os.path
import codecs

from codecs import open as copen
from six import viewkeys

from .mapping import map_name


class NoCountryError(Exception):
"""Raised when non-supported country is queried"""

"""Raised when non-supported country is queried."""
pass


class Detector:
"""Get gender by first name"""

COUNTRIES = u"""great_britain ireland usa italy malta portugal spain france
belgium luxembourg the_netherlands east_frisia germany austria
swiss iceland denmark norway sweden finland estonia latvia
lithuania poland czech_republic slovakia hungary romania
bulgaria bosniaand croatia kosovo macedonia montenegro serbia
slovenia albania greece russia belarus moldova ukraine armenia
azerbaijan georgia the_stans turkey arabia israel china india
japan korea vietnam other_countries
"""Get gender by first name."""

COUNTRIES = """great_britain ireland usa italy malta portugal spain france
belgium luxembourg the_netherlands east_frisia germany
austria swiss iceland denmark norway sweden finland estonia
latvia lithuania poland czech_republic slovakia hungary
romania bulgaria bosniaand croatia kosovo macedonia
montenegro serbia slovenia albania greece russia belarus
moldova ukraine armenia azerbaijan georgia the_stans turkey
arabia israel china india japan korea vietnam other_countries
""".split()

def __init__(self,
case_sensitive=True,
unknown_value=u"andy"):
unknown_value="andy"):

"""Creates a detector parsing given data file.

"""Creates a detector parsing given data file"""
Args:
case_sensitive (''bool''): Whether it's case sensitive or not.
unknown_value (''str''): The value returned when unknown (not m/f).
"""
self.case_sensitive = case_sensitive
self.unknown_value = unknown_value
self._parse(os.path.join(os.path.dirname(__file__), "data/nam_dict.txt"))
self._parse(os.path.join(os.path.dirname(__file__),
"data/nam_dict.txt"
)
)

def _parse(self, filename):
"""Opens data file and for each line, calls _eat_name_line"""
"""Opens data file and for each line, calls _eat_name_line.

Args:
filename (''str''): Filename for the data file.
"""
self.names = {}
with codecs.open(filename, encoding="iso8859-1") as f:
with copen(filename, encoding="iso8859-1") as f:
for line in f:
if any(map(lambda c: 128 < ord(c) < 160, line)):
line = line.encode("iso8859-1").decode("windows-1252")
line = line

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice.

self._eat_name_line(line.strip())

def _eat_name_line(self, line):
"""Parses one line of data file"""
"""Parses one line of data file.

Args:
line (''str''): A line from the data file.
"""
if line[0] not in "#=":
parts = line.split()
country_values = line[30:-1]
Expand All @@ -49,55 +71,89 @@ def _eat_name_line(self, line):
name = name.lower()

if parts[0] == "M":
self._set(name, u"male", country_values)
self._set(name, "male", country_values)
elif parts[0] == "1M" or parts[0] == "?M":
self._set(name, u"mostly_male", country_values)
self._set(name, "mostly_male", country_values)
elif parts[0] == "F":
self._set(name, u"female", country_values)
self._set(name, "female", country_values)
elif parts[0] == "1F" or parts[0] == "?F":
self._set(name, u"mostly_female", country_values)
self._set(name, "mostly_female", country_values)
elif parts[0] == "?":
self._set(name, self.unknown_value, country_values)
else:
raise "Not sure what to do with a sex of %s" % parts[0]

def _set(self, name, gender, country_values):
"""Sets gender and relevant country values for names dictionary of detector"""
"""Set gender and country values for names dictionary of detector.

Args:
name (''str''): Name of the person.
gender (''str''): The gender of the person
country_values (''str''): The country.
"""
if '+' in name:
for replacement in ['', ' ', '-']:
self._set(name.replace('+', replacement), gender, country_values)
self._set(name.replace('+', replacement),
gender,
country_values
)
else:
if name not in self.names:
self.names[name] = {}
self.names[name][gender] = country_values

def _most_popular_gender(self, name, counter):
"""Finds the most popular gender for the given name counting by given counter"""
"""Find the most popular gender for the given name by given counter

Args:
name (''str''): The Name.
counter (''int''): The number given for popularity of the name
based on the country. 1 (=rare) to 13 (=extremely common). See
the link in readme about the data file for more information.
Return:
Best value for a name.
"""
if name not in self.names:
return self.unknown_value

max_count, max_tie = (0, 0)
best = self.names[name].keys()[0]
best = [a for a in viewkeys(self.names[name])][0]
for gender, country_values in self.names[name].items():
count, tie = counter(country_values)
if count > max_count or (count == max_count and tie > max_tie):
max_count, max_tie, best = count, tie, gender

return best if max_count > 0 else self.unknown_value

def counter(self, country_values):
    """Score a country-values string by coverage and summed popularity.

    Spaces are removed before scoring so that blank columns neither count
    towards the length nor drag the sum down (a space would otherwise
    decode to ``32 - 48 = -16``).

    Args:
        country_values (''str''): Popularity codes, one character per
            column. Characters with a code point above 64 decode to
            ``ord(c) - 55`` ("A" -> 10); all others decode to
            ``ord(c) - 48`` ("1" -> 1).
    Return:
        tuple ``(count, total)``: the number of non-space characters and
        the sum of their decoded values.
    """
    codes = [ord(a) for a in country_values.replace(" ", "")]
    return (len(codes),
            sum(c - 55 if c > 64 else c - 48 for c in codes))

def get_gender(self, name, country=None):
"""Returns best gender for the given name and country pair"""
"""Returns best gender for the given name and country pair

Args:
name (''str''): The name to look up
country (''str''): Name of a country or None
Return:
The best gender for the given name and country pair
"""
if not self.case_sensitive:
name = name.lower()

if name not in self.names:
return self.unknown_value
elif not country:
def counter(country_values):
country_values = map(ord, country_values.replace(" ", ""))
return (len(country_values),
sum(map(lambda c: c > 64 and c-55 or c-48, country_values)))
return self._most_popular_gender(name, counter)

return self._most_popular_gender(name, self.counter)
elif country in self.__class__.COUNTRIES:
index = self.__class__.COUNTRIES.index(country)
counter = lambda e: (ord(e[index])-32, 0)
Expand Down
17 changes: 16 additions & 1 deletion sexmachine/mapping.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
# -*- coding: utf-8 -*-
# The encoding declaration is only honoured on line 1 or 2 of the file, so it
# has to come before the module docstring; this module contains non-ASCII
# string literals.
"""Mapper for special chars."""

import six
mappings = ((256, ["<A/>"]),
(257, ["<a/>"]),
(258, ["<Â>"]),
Expand Down Expand Up @@ -70,7 +73,19 @@


def map_name(u):
"""Map a string.

Args:
u (''str''): string to convert.
Return:
Converted string.
"""
for code, patterns in mappings:
for pattern in patterns:
u = u.replace(pattern.decode("utf-8"), unichr(code))

# Workaround for special chars that don't work with py2.
if six.PY2:
u = u.replace(pattern.decode("utf-8"), unichr(code))
elif six.PY3:
u = u.replace(six.u(pattern), six.unichr(code))
return u
36 changes: 0 additions & 36 deletions sexmachine/test/test_detector.py

This file was deleted.