Mirror of https://github.com/vale981/ray, synced 2025-03-05 18:11:42 -05:00

[AIR] Change FeatureHasher input schema to expect token counts (#27523)

This makes FeatureHasher work more like sklearn's FeatureHasher.

parent f6328f46a3
commit 5087511c46

2 changed files with 82 additions and 42 deletions
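To ground the sklearn comparison in the commit message: scikit-learn's FeatureHasher already consumes token counts (e.g. dicts of token -> count) rather than raw cell values, which is the schema this commit adopts. A minimal sketch of the sklearn counterpart, for orientation only; it is not code from this PR, `alternate_sign=False` keeps plain unsigned counts like Ray's version, and the two libraries' hash functions place tokens in different buckets:

from sklearn.feature_extraction import FeatureHasher

hasher = FeatureHasher(n_features=8, input_type="dict", alternate_sign=False)
documents = [
    {"I": 1, "like": 1, "Python": 1},     # token counts for "I like Python"
    {"I": 1, "dislike": 1, "Python": 1},  # token counts for "I dislike Python"
]
# Returns a scipy.sparse matrix of shape (2, 8); one column per hash bucket.
print(hasher.transform(documents).toarray())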
@@ -9,19 +9,68 @@ from ray.data.preprocessors.utils import simple_hash

 class FeatureHasher(Preprocessor):
-    """Hash the features of the specified columns.
+    """Apply the `hashing trick <https://en.wikipedia.org/wiki/Feature_hashing>`_ to a
+    table that describes token frequencies.

-    The created columns will have names in the format ``hash_{column_names}_{hash}``,
-    e.g. ``hash_column1_column2_0``, ``hash_column1_column2_1``, ...
+    :class:`FeatureHasher` creates ``num_features`` columns named ``hash_{index}``,
+    where ``index`` ranges from :math:`0` to ``num_features``:math:`- 1`. The column
+    ``hash_{index}`` describes the frequency of tokens that hash to ``index``.

-    Note: Currently sparse matrices are not supported.
-    Therefore, it is recommended to **not** use a large ``num_features``.
+    Distinct tokens can correspond to the same index. However, if ``num_features`` is
+    large enough, then each column probably corresponds to a unique token.
+
+    This preprocessor is memory efficient and quick to pickle. However, given a
+    transformed column, you can't know which tokens correspond to it. This might make
+    it hard to determine which tokens are important to your model.
+
+    .. warning::
+        Sparse matrices aren't supported. If you use a large ``num_features``, this
+        preprocessor might behave poorly.

     Args:
-        columns: The columns of features that should be projected
-            onto a single hashed feature vector.
-        num_features: The size of the hashed feature vector.
-    """
+        columns: The columns to apply the hashing trick to. Each column should describe
+            the frequency of a token.
+        num_features: The number of features used to represent the vocabulary. You
+            should choose a value large enough to prevent hash collisions between
+            distinct tokens.
+
+    Examples:
+        >>> import pandas as pd
+        >>> import ray
+        >>> from ray.data.preprocessors import FeatureHasher
+
+        The data below describes the frequencies of tokens in ``"I like Python"`` and
+        ``"I dislike Python"``.
+
+        >>> df = pd.DataFrame({
+        ...     "I": [1, 1],
+        ...     "like": [1, 0],
+        ...     "dislike": [0, 1],
+        ...     "Python": [1, 1]
+        ... })
+        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
+
+        :class:`FeatureHasher` hashes each token to determine its index. For example,
+        the index of ``"I"`` is :math:`hash(\\texttt{"I"}) \pmod 8 = 5`.
+
+        >>> hasher = FeatureHasher(columns=["I", "like", "dislike", "Python"], num_features=8)
+        >>> hasher.fit_transform(ds).to_pandas().to_numpy()  # doctest: +SKIP
+        array([[0, 0, 0, 2, 0, 1, 0, 0],
+               [0, 0, 0, 1, 0, 1, 1, 0]])
+
+        Notice the hash collision: both ``"like"`` and ``"Python"`` correspond to index
+        :math:`3`. You can avoid hash collisions like these by increasing
+        ``num_features``.
+
+    .. seealso::
+        :class:`~ray.data.preprocessors.CountVectorizer`
+            Use this preprocessor to generate inputs for :class:`FeatureHasher`.
+
+        :class:`~ray.data.preprocessors.HashingVectorizer`
+            If your input data describes documents rather than token frequencies,
+            use :class:`~ray.data.preprocessors.HashingVectorizer`.
+    """  # noqa: E501

     _is_fittable = False
@@ -33,17 +82,12 @@ class FeatureHasher(Preprocessor):

     def _transform_pandas(self, df: pd.DataFrame):
         # TODO(matt): Use sparse matrix for efficiency.
-        joined_columns = "_".join(self.columns)
-
         def row_feature_hasher(row):
             hash_counts = collections.defaultdict(int)
             for column in self.columns:
-                hashed_value = simple_hash(row[column], self.num_features)
-                hash_counts[hashed_value] = hash_counts[hashed_value] + 1
-            return {
-                f"hash_{joined_columns}_{i}": hash_counts[i]
-                for i in range(self.num_features)
-            }
+                hashed_value = simple_hash(column, self.num_features)
+                hash_counts[hashed_value] += row[column]
+            return {f"hash_{i}": hash_counts[i] for i in range(self.num_features)}

         feature_columns = df.loc[:, self.columns].apply(
             row_feature_hasher, axis=1, result_type="expand"
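The behavioral change in this hunk is easy to miss: the old code hashed each cell's value (simple_hash(row[column], ...)) and counted occurrences, while the new code hashes the column name, i.e. the token itself, and adds the cell's value (the token's count) into that bucket. A self-contained sketch of the new per-row logic; simple_hash_stub is a hypothetical stand-in for Ray's internal simple_hash (same assumed (value, num_buckets) signature), so bucket indices won't match Ray's actual output:

import collections
import hashlib

import pandas as pd


def simple_hash_stub(value, num_buckets: int) -> int:
    # Deterministic placeholder; not Ray's real hash function.
    digest = hashlib.md5(str(value).encode()).hexdigest()
    return int(digest, 16) % num_buckets


def hash_token_counts(df: pd.DataFrame, columns, num_features: int) -> pd.DataFrame:
    # Mirrors the new _transform_pandas: hash each token (column name) once,
    # then accumulate that row's count into the token's bucket.
    def row_feature_hasher(row):
        hash_counts = collections.defaultdict(int)
        for column in columns:
            hashed_value = simple_hash_stub(column, num_features)
            hash_counts[hashed_value] += row[column]
        return {f"hash_{i}": hash_counts[i] for i in range(num_features)}

    return df.loc[:, columns].apply(row_feature_hasher, axis=1, result_type="expand")


token_counts = pd.DataFrame(
    {"I": [1, 1], "like": [1, 0], "dislike": [0, 1], "Python": [1, 1]}
)
print(hash_token_counts(token_counts, ["I", "like", "dislike", "Python"], 8))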
@@ -1302,33 +1302,29 @@ def test_tokenizer():

 def test_feature_hasher():
     """Tests basic FeatureHasher functionality."""
-
-    col_a = [0, "a", "b"]
-    col_b = [0, "a", "c"]
-    in_df = pd.DataFrame.from_dict({"A": col_a, "B": col_b})
-    ds = ray.data.from_pandas(in_df)
-
-    hasher = FeatureHasher(["A", "B"], num_features=5)
-    transformed = hasher.transform(ds)
-    out_df = transformed.to_pandas()
-
-    processed_col_0 = [0, 0, 1]
-    processed_col_1 = [0, 0, 1]
-    processed_col_2 = [0, 2, 0]
-    processed_col_3 = [2, 0, 0]
-    processed_col_4 = [0, 0, 0]
-
-    expected_df = pd.DataFrame.from_dict(
-        {
-            "hash_A_B_0": processed_col_0,
-            "hash_A_B_1": processed_col_1,
-            "hash_A_B_2": processed_col_2,
-            "hash_A_B_3": processed_col_3,
-            "hash_A_B_4": processed_col_4,
-        }
+    # This dataframe represents the counts from the documents "I like Python" and
+    # "I dislike Python".
+    token_counts = pd.DataFrame(
+        {"I": [1, 1], "like": [1, 0], "dislike": [0, 1], "Python": [1, 1]}
     )

-    assert out_df.equals(expected_df)
+    hasher = FeatureHasher(["I", "like", "dislike", "Python"], num_features=256)
+    document_term_matrix = hasher.fit_transform(
+        ray.data.from_pandas(token_counts)
+    ).to_pandas()
+
+    # The document-term matrix should have shape (# documents, # features).
+    assert document_term_matrix.shape == (2, 256)
+
+    # The tokens "I", "like", and "Python" should hash to distinct indices
+    # for an adequately large `num_features`.
+    assert document_term_matrix.iloc[0].sum() == 3
+    assert all(document_term_matrix.iloc[0] <= 1)
+
+    # The tokens "I", "dislike", and "Python" should hash to distinct
+    # indices for an adequately large `num_features`.
+    assert document_term_matrix.iloc[1].sum() == 3
+    assert all(document_term_matrix.iloc[1] <= 1)


 def test_hashing_vectorizer():
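The test's "distinct indices" comments rely on ``num_features=256`` dwarfing the four-token vocabulary. Treating the hash as uniform, a birthday-problem estimate shows how safe that assumption is; this is a back-of-the-envelope check, not part of the PR, and the real simple_hash is deterministic, so the test itself is reproducible either way:

# Probability that 4 uniformly hashed tokens occupy 4 distinct buckets out of 256.
num_features, num_tokens = 256, 4
p_no_collision = 1.0
for k in range(num_tokens):
    p_no_collision *= (num_features - k) / num_features
print(f"{p_no_collision:.3f}")  # ~0.977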