[AIR] Change FeatureHasher input schema to expect token counts (#27523)

This makes FeatureHasher work more like sklearn's FeatureHasher.
This commit is contained in:
Balaji Veeramani 2022-08-08 11:41:57 -07:00 committed by GitHub
parent f6328f46a3
commit 5087511c46
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 82 additions and 42 deletions

View file

@ -9,19 +9,68 @@ from ray.data.preprocessors.utils import simple_hash
class FeatureHasher(Preprocessor):
"""Hash the features of the specified columns.
"""Apply the `hashing trick <https://en.wikipedia.org/wiki/Feature_hashing>`_ to a
table that describes token frequencies.
The created columns will have names in the format ``hash_{column_names}_{hash}``,
e.g. ``hash_column1_column2_0``, ``hash_column1_column2_1``, ...
:class:`FeatureHasher` creates ``num_features`` columns named ``hash_{index}``,
where ``index`` ranges from :math:`0` to ``num_features``:math:`- 1`. The column
``hash_{index}`` describes the frequency of tokens that hash to ``index``.
Note: Currently sparse matrices are not supported.
Therefore, it is recommended to **not** use a large ``num_features``.
Distinct tokens can correspond to the same index. However, if ``num_features`` is
large enough, then each column probably corresponds to a unique token.
This preprocessor is memory efficient and quick to pickle. However, given a
transformed column, you can't know which tokens correspond to it. This might make it
hard to determine which tokens are important to your model.
.. warning::
Sparse matrices aren't supported. If you use a large ``num_features``, this
preprocessor might behave poorly.
Args:
columns: The columns of features that should be projected
onto a single hashed feature vector.
num_features: The size of the hashed feature vector.
"""
columns: The columns to apply the hashing trick to. Each column should describe
the frequency of a token.
num_features: The number of features used to represent the vocabulary. You
should choose a value large enough to prevent hash collisions between
distinct tokens.
Examples:
>>> import pandas as pd
>>> import ray
>>> from ray.data.preprocessors import FeatureHasher
The data below describes the frequencies of tokens in ``"I like Python"`` and
``"I dislike Python"``.
>>> df = pd.DataFrame({
... "I": [1, 1],
... "like": [1, 0],
... "dislike": [0, 1],
... "Python": [1, 1]
... })
>>> ds = ray.data.from_pandas(df) # doctest: +SKIP
:class:`FeatureHasher` hashes each token to determine its index. For example,
the index of ``"I"`` is :math:`hash(\\texttt{"I"}) \\pmod 8 = 5`.
>>> hasher = FeatureHasher(columns=["I", "like", "dislike", "Python"], num_features=8)
>>> hasher.fit_transform(ds).to_pandas().to_numpy() # doctest: +SKIP
array([[0, 0, 0, 2, 0, 1, 0, 0],
[0, 0, 0, 1, 0, 1, 1, 0]])
Notice the hash collision: both ``"like"`` and ``"Python"`` correspond to index
:math:`3`. You can avoid hash collisions like these by increasing
``num_features``.
.. seealso::
:class:`~ray.data.preprocessors.CountVectorizer`
Use this preprocessor to generate inputs for :class:`FeatureHasher`.
:class:`~ray.data.preprocessors.HashingVectorizer`
If your input data describes documents rather than token frequencies,
use :class:`~ray.data.preprocessors.HashingVectorizer`.
""" # noqa: E501
_is_fittable = False
@ -33,17 +82,12 @@ class FeatureHasher(Preprocessor):
def _transform_pandas(self, df: pd.DataFrame):
# TODO(matt): Use sparse matrix for efficiency.
joined_columns = "_".join(self.columns)
def row_feature_hasher(row):
hash_counts = collections.defaultdict(int)
for column in self.columns:
hashed_value = simple_hash(row[column], self.num_features)
hash_counts[hashed_value] = hash_counts[hashed_value] + 1
return {
f"hash_{joined_columns}_{i}": hash_counts[i]
for i in range(self.num_features)
}
hashed_value = simple_hash(column, self.num_features)
hash_counts[hashed_value] += row[column]
return {f"hash_{i}": hash_counts[i] for i in range(self.num_features)}
feature_columns = df.loc[:, self.columns].apply(
row_feature_hasher, axis=1, result_type="expand"

View file

@ -1302,33 +1302,29 @@ def test_tokenizer():
def test_feature_hasher():
"""Tests basic FeatureHasher functionality."""
col_a = [0, "a", "b"]
col_b = [0, "a", "c"]
in_df = pd.DataFrame.from_dict({"A": col_a, "B": col_b})
ds = ray.data.from_pandas(in_df)
hasher = FeatureHasher(["A", "B"], num_features=5)
transformed = hasher.transform(ds)
out_df = transformed.to_pandas()
processed_col_0 = [0, 0, 1]
processed_col_1 = [0, 0, 1]
processed_col_2 = [0, 2, 0]
processed_col_3 = [2, 0, 0]
processed_col_4 = [0, 0, 0]
expected_df = pd.DataFrame.from_dict(
{
"hash_A_B_0": processed_col_0,
"hash_A_B_1": processed_col_1,
"hash_A_B_2": processed_col_2,
"hash_A_B_3": processed_col_3,
"hash_A_B_4": processed_col_4,
}
# This dataframe represents the counts from the documents "I like Python" and "I
# dislike Python".
token_counts = pd.DataFrame(
{"I": [1, 1], "like": [1, 0], "dislike": [0, 1], "Python": [1, 1]}
)
assert out_df.equals(expected_df)
hasher = FeatureHasher(["I", "like", "dislike", "Python"], num_features=256)
document_term_matrix = hasher.fit_transform(
ray.data.from_pandas(token_counts)
).to_pandas()
# Document-term matrix should have shape (# documents, # features)
assert document_term_matrix.shape == (2, 256)
# The tokens "I", "like", and "Python" should be hashed to distinct indices
# for adequately large `num_features`.
assert document_term_matrix.iloc[0].sum() == 3
assert all(document_term_matrix.iloc[0] <= 1)
# The tokens "I", "dislike", and "Python" should be hashed to distinct
# indices for adequately large `num_features`.
assert document_term_matrix.iloc[1].sum() == 3
assert all(document_term_matrix.iloc[1] <= 1)
def test_hashing_vectorizer():