Documentation

fns is a collection of Python functions reusable across ML projects.

Installation

pip install fns

Prototyping helpers

  • Import the most common Python stdlib and data-science functions into the current session.
from fns.all import *

Functions

array_except_element(arr, elem)

Get a copy of a list with the first occurrence of an element removed.

Parameters:

- `arr` (`List`): required
- `elem` (`Any`): required

Returns:

- `List`: Array

Examples:


>>> array_except_element([1, 2, 3], 3)
[1, 2]
Source code in fns/fns.py
def array_except_element(arr: List, elem: Any) -> List:
    """
    Get copy of array without an element.

    Args:
        arr:
        elem:

    Returns:
        Array

    Example:
    ```python
    >>> array_except_element([1, 2, 3], 3)
    [1, 2]
    ```
    """
    elem_index = arr.index(elem)
    return arr[:elem_index] + arr[elem_index + 1 :]

base64_dict(base64_str)

Parse a base64-encoded JSON as dictionary.

Parameters:

- `base64_str` (`str`): Base-64 encoded string representation of JSON. Required.

Returns:

- `Dict`: Python Dictionary

Source code in fns/fns.py
def base64_dict(base64_str: str) -> Dict:
    """
    Parse a base64-encoded JSON as dictionary.

    Args:
        base64_str: Base-64 encoded string representation of JSON

    Returns:
        Python Dictionary
    """
    return json.loads(base64.b64decode(base64_str))

flatten(x)

Flatten a list of lists.

Parameters:

- `x` (`List[List]`): List of lists of elements. Required.

Returns:

- `Iterator`: Iterator of flattened elements.

Source code in fns/fns.py
def flatten(x: List[List]) -> Iterator:
    """
    Flatten a list of list.

    Args:
        x: List of list of elements

    Returns:
        Iterator of flattened array.
    """
    return itertools.chain.from_iterable(x)

format_as_hms(seconds)

Convert seconds to HH:MM:SS format.

Parameters:

- `seconds` (`Union[int, float]`): Number of seconds. Required.

Returns:

- `str`: String in the format HH:MM:SS

Source code in fns/fns.py
def format_as_hms(seconds: Union[int, float]) -> str:
    """
    Convert seconds to HH:MM:SS format.

    Args:
        seconds: Number of seconds

    Returns:
        String in the format HH:MM:SS
    """
    return time.strftime("%H:%M:%S", time.gmtime(seconds))
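
For example (a quick sketch; note that `time.gmtime` wraps around after 24 hours):

```python
>>> format_as_hms(3725)
'01:02:05'
```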

generate_edits(word, n=1)

Generate variations that are n edits away from word.

Adapted from: https://norvig.com/spell-correct.html

Parameters:

- `word` (`str`): Single word. Required.
- `n` (`int`): Number of edits away from word. Default: `1`.

Returns:

- `List[str]`: List of edits

Source code in fns/fns.py
def generate_edits(word: str, n: int = 1) -> List[str]:
    """
    Generate variations that are `n` edits away from word.

    Adapted from: https://norvig.com/spell-correct.html

    Args:
        word: Single word
        n: Number of edits away from word.

    Returns:
        List of edits
    """

    def edits1(word: str):
        letters = "abcdefghijklmnopqrstuvwxyz"
        splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
        deletes = [L + R[1:] for L, R in splits if R]
        transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
        replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
        inserts = [L + c + R for L, R in splits for c in letters]
        return set(deletes + transposes + replaces + inserts)

    edits = edits1(word)
    for _ in range(n - 1):
        edits = [e2 for e1 in edits for e2 in edits1(e1)]
    # Cast to list so the return type matches the annotation for n == 1 as well
    return list(edits)
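
A quick membership check, consistent with the definition above:

```python
>>> 'cat' in generate_edits('at')
True
```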

harmonic_mean(a, b)

Compute harmonic mean of two numbers.

Parameters:

- `a` (`Union[int, float]`): First number. Required.
- `b` (`Union[int, float]`): Second number. Required.

Returns:

- `Union[int, float]`: Harmonic mean

Source code in fns/fns.py
def harmonic_mean(a: Union[int, float], b: Union[int, float]) -> Union[int, float]:
    """
    Compute harmonic mean of two numbers.

    Args:
        a: First number
        b: Second number

    Returns:
        Harmonic mean
    """
    return (2 * a * b) / (a + b)
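
For example, following directly from the formula 2ab / (a + b):

```python
>>> harmonic_mean(1, 3)
1.5
```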

hash_file(file_object)

Calculate MD5 hash of file.

Parameters:

- `file_object` (`IO`): File object. Required.

Returns:

- MD5 hash of the file

Source code in fns/fns.py
def hash_file(file_object: IO):
    """
    Calculate MD5 hash of file.

    Args:
        file_object: File object

    Returns:
        MD5 hash of the file
    """
    # Calculate hash
    unique_hash = md5_hash(file_object.read())

    # Reset file pointer to start
    file_object.seek(0)

    return unique_hash

minibatch(items, size)

Create mini-batches of length 'size' from a list of items.

Original Source: spacy package

Original function definition: https://github.com/explosion/spaCy/blob/master/spacy/util.py#L1426

Source code in fns/fns.py
def minibatch(items, size):
    """
    Create mini-batches of length 'size' from a list of items.

    Original Source: `spacy` package

    Original function definition:
    https://github.com/explosion/spaCy/blob/master/spacy/util.py#L1426
    """
    if isinstance(size, int):
        size_ = itertools.repeat(size)
    else:
        size_ = size
    items = iter(items)
    while True:
        batch_size = next(size_)
        batch = list(itertools.islice(items, int(batch_size)))
        if len(batch) == 0:
            break
        yield list(batch)
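
A usage sketch; the final batch may be smaller than `size`:

```python
>>> list(minibatch([1, 2, 3, 4, 5], size=2))
[[1, 2], [3, 4], [5]]
```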

ngrams(tokens, n)

Parameters:

- `tokens` (`List`): List of elements. Required.
- `n` (`int`): N-gram size. Required.

Returns:

- List of ngrams

Source code in fns/fns.py
def ngrams(tokens: List, n: int):
    """

    Args:
        tokens: List of elements
        n: N-gram size

    Returns:
        List of ngrams
    """
    return [tokens[i : i + n] for i in range(len(tokens) - n + 1)]
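
For example, bigrams over a token list:

```python
>>> ngrams(['a', 'b', 'c', 'd'], 2)
[['a', 'b'], ['b', 'c'], ['c', 'd']]
```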

num_files(path)

Get the number of files in a path.

Parameters:

- `path` (`Union[pathlib.Path, str]`): Directory path. Required.

Returns:

- `int`: Number of files

Source code in fns/fns.py
def num_files(path: Union[Path, str]) -> int:
    """
    Get the number of files in a path.

    Args:
        path: File path

    Returns:
        Number of files
    """
    return len(os.listdir(path))

parse_manual(parser, command)

Use argument parser in notebooks.

Parameters:

- `parser` (`ArgumentParser`): ArgumentParser. Required.
- `command` (`str`): Command-line arguments as a string. Required.

Returns:

- `Namespace`: Parsed arguments as a namespace

Source code in fns/fns.py
def parse_manual(parser: argparse.ArgumentParser, command: str) -> argparse.Namespace:
    """
    Use argument parser in notebooks.

    Args:
        parser: ArgumentParser
        command: Command line arguments as string

    Returns:
        Parsed argument as namespace
    """
    args = command.split()
    return parser.parse_args(args=args)
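
A notebook-style usage sketch (the parser and the `--lr` flag are hypothetical):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--lr", type=float)

args = parse_manual(parser, "--lr 0.01")
print(args.lr)  # 0.01
```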

percent_dict(d)

Convert a dictionary of key-value to key:coverage-percent.

Parameters:

- `d` (`Dict`): Dictionary of keys and values. Required.

Returns:

- `Dict`: Dictionary of keys and percent coverage

Source code in fns/fns.py
def percent_dict(d: Dict) -> Dict:
    """
    Convert a dictionary of key-value to key:coverage-percent.

    Args:
        d: Dictionary of key and values

    Returns:
        Dictionary of key and percent-coverage
    """
    total = sum(d.values())
    return {key: value / total * 100.0 for key, value in d.items()}
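
For example:

```python
>>> percent_dict({'a': 1, 'b': 3})
{'a': 25.0, 'b': 75.0}
```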

print_json(d)

Render python dictionary as JSON with double quotes and indentation.

Parameters:

- `d` (`Dict`): Python dictionary. Required.

Returns:

- `None`

Source code in fns/fns.py
def print_json(d: Dict) -> None:
    """
    Render python dictionary as JSON with double quotes and indentation.

    Args:
        d: Python dictionary

    Returns:
        None
    """
    print(json.dumps(d, indent=4))

read_as_base64(path)

Convert file contents into a base64 string

Parameters:

- `path` (`Union[str, pathlib.Path]`): File path. Required.

Returns:

- `str`: Base64 string

Source code in fns/fns.py
def read_as_base64(path: Union[str, Path]) -> str:
    """
    Convert file contents into a base64 string

    Args:
        path: File path

    Returns:
        Base64 string
    """
    content = Path(path).read_text()
    return base64.b64encode(content.encode("utf-8")).decode("utf-8")

read_json(json_path)

Read json file from a path.

Parameters:

- `json_path` (`Union[str, pathlib.Path]`): File path to a json file. Required.

Returns:

- `Dict`: Python dictionary

Source code in fns/fns.py
def read_json(json_path: Union[str, Path]) -> Dict:
    """
    Read json file from a path.

    Args:
        json_path: File path to a json file.

    Returns:
        Python dictionary
    """
    with open(json_path, "r") as fp:
        data = json.load(fp)
    return data

read_pickle(path)

Read a pickle file from path.

Parameters:

- `path` (`Union[str, pathlib.Path]`): File path. Required.

Returns:

- `Any`: Unpickled object

Source code in fns/fns.py
def read_pickle(path: Union[str, Path]) -> Any:
    """
    Read a pickle file from path.

    Args:
        path: File path

    Returns:
        Unpickled object
    """
    with open(path, "rb") as fp:
        return pickle.load(fp)

reverse_mapping(d)

Swap mapping from key: value to value: key

Parameters:

- `d` (`Dict`): Python Dictionary. Required.

Returns:

- `Dict`: Dictionary with keys and values swapped

Source code in fns/fns.py
def reverse_mapping(d: Dict) -> Dict:
    """
    Swap mapping from key: value to value: key

    Args:
        d: Python Dictionary

    Returns:
        Dictionary with key and value swapped
    """
    return {v: k for k, v in d.items()}
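
For example (values must be hashable, and duplicate values collapse to one key):

```python
>>> reverse_mapping({'a': 1, 'b': 2})
{1: 'a', 2: 'b'}
```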

roundup(n, m=10)

Round up a number n to the nearest multiple of m.

Parameters:

- `n` (`float`): Number. Required.
- `m` (`int`): Multiple to round up to. Default: `10`.

Returns:

- `int`: Rounded integer

Source code in fns/fns.py
def roundup(n: float, m: int = 10) -> int:
    """
    Round up a number n to the nearest multiple of M.

    Args:
        n: Number
        m: Multiple of which number to roundup to

    Returns:
        Rounded integer number
    """
    return int(math.ceil(n / m)) * m
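
For example:

```python
>>> roundup(42)
50
>>> roundup(3.2, m=5)
5
```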

sort_dict_by_value(d, reverse=False)

Sort items in dictionary by value.

Examples:


>>> sort_dict_by_value({'gold': 40, 'silver': 25})
{'silver': 25, 'gold': 40}

Parameters:

- `d` (`Dict`): Python Dictionary. Required.
- `reverse` (`bool`): Sort order. Default: `False`.

Returns:

- `Dict`: Sorted dictionary

Source code in fns/fns.py
def sort_dict_by_value(d: Dict, reverse: bool = False) -> Dict:
    """
    Sort items in dictionary by value.

    Example:
    ```python
    >>> sort_dict_by_value({'gold': 40, 'silver': 25})
    {'silver': 25, 'gold': 40}
    ```

    Args:
        d: Python Dictionary
        reverse: Sort order

    Returns:
        Sorted dictionary
    """
    return dict(sorted(d.items(), key=lambda item: item[1], reverse=reverse))

top(data, n=5)

Get a dictionary of top-n items from a list.

Parameters:

- `data`: Python collection. Required.
- `n` (`int`): Number of top values. Default: `5`.

Returns:

- `Dict`: Dictionary of top-n items and counts

Source code in fns/fns.py
def top(data, n: int = 5) -> Dict:
    """
    Get a dictionary of top-n items from a list.

    Args:
        data: Python collection
        n: Number of top-values

    Returns:
        Dictionary of top-n items and count
    """
    return dict(Counter(data).most_common(n))
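
For example:

```python
>>> top(['a', 'b', 'a', 'c', 'a', 'b'], n=2)
{'a': 3, 'b': 2}
```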

top_n_from_dict(dictionary, n=10)

Get top n largest values from the dictionary.

Parameters:

- `dictionary` (`Dict`): Python dictionary. Required.
- `n` (`int`): Number of keys to pick. Default: `10`.

Source code in fns/fns.py
def top_n_from_dict(dictionary: Dict, n: int = 10):
    """
    Get top n largest values from the dictionary.

    Args:
        dictionary: Python dictionary
        n: Number of keys to pick

    Returns:

    """
    return top(dictionary, n=n)

write_json(item, path, mode='w')

Save json to a file.

Parameters:

- `item` (`Dict`): Python dictionary. Required.
- `path` (`Union[pathlib.Path, str]`): File path to save at. Required.
- `mode` (`str`): File write mode. Default: `'w'`.

Returns:

- `None`

Source code in fns/fns.py
def write_json(item: Dict, path: Union[Path, str], mode: str = "w") -> None:
    """
    Save json to a file.

    Args:
        item: Python dictionary
        path: File path to save at
        mode: File write mode

    Returns:
        None
    """
    with open(path, mode=mode) as fp:
        json.dump(item, fp)

write_pickle(item, path)

Pickle a python object.

Parameters:

- `item` (`Any`): Python object. Required.
- `path` (`Union[pathlib.Path, str]`): File path to save the pickle file. Required.

Returns:

- `None`

Source code in fns/fns.py
def write_pickle(item: Any, path: Union[Path, str]) -> None:
    """
    Pickle a python object.

    Args:
        item: Python object
        path: File path to save the pickle file

    Returns:
        None
    """
    with open(path, "wb") as fp:
        pickle.dump(item, fp)

Functions

cluster_text(texts, return_dataframe=True, n=None)

Quickly cluster a list of sentences for EDA.

Parameters:

- `texts` (`List[str]`): List of sentences. Required.
- `return_dataframe` (`bool`): Whether to return a dataframe or a list of cluster labels. Default: `True`.
- `n` (`int`): Number of clusters. Default: `None`.

Returns:

- `Union[pandas.core.frame.DataFrame, List[int]]`

Source code in fns/cluster.py
def cluster_text(
    texts: List[str], return_dataframe: bool = True, n: int = None
) -> Union[pd.DataFrame, List[int]]:
    """
    Quickly cluster a list of sentences for EDA.

    Args:
        n: Number of clusters
        texts: List of sentences
        return_dataframe: Whether to return as dataframe or a list of cluster labels

    Returns:

    """
    n = n or n_clusters(texts)
    hybrid_tfidf = FeatureUnion(
        [
            (
                "word_tfidf",
                TfidfVectorizer(
                    ngram_range=(1, 2),
                    strip_accents="unicode",
                    analyzer="word",
                    stop_words="english",
                    sublinear_tf=True,
                ),
            ),
            (
                "char_tfidf",
                TfidfVectorizer(
                    ngram_range=(3, 3),
                    strip_accents="unicode",
                    analyzer="char_wb",
                    stop_words="english",
                    sublinear_tf=True,
                ),
            ),
        ]
    )
    cluster_pipeline = make_pipeline(
        hybrid_tfidf,
        FunctionTransformer(lambda x: x.todense(), accept_sparse=True, validate=False),
        PCA(0.9, random_state=0),
        KMeans(n, random_state=0),
    )
    clusters = cluster_pipeline.fit_predict(texts)
    if return_dataframe:
        return (
            pd.DataFrame({"text": texts, "cluster": clusters})
            .assign(
                cluster_size=lambda d: d["cluster"].map(d["cluster"].value_counts())
            )
            .sort_values(
                by=["cluster_size", "cluster", "text"], ascending=[False, True, True]
            )
            .drop(columns=["cluster_size"])
        )
    else:
        return clusters

similarity_sort(texts)

Sort a list of sentences such that similar sentences are placed consecutively.

Parameters:

- `texts` (`List[str]`): List of sentences. Required.

Returns:

- `List[str]`: Sorted sentences

Source code in fns/cluster.py
def similarity_sort(texts: List[str]) -> List[str]:
    """
    Sort list of sentences such that similar sentences are placed consecutively.

    Args:
        texts: List of sentences

    Returns:
        Sorted sentences
    """
    df = cluster_text(texts, n=len(texts) // 2)
    return df["text"].tolist()

Functions

expose_port(port, path='/')

Expose port as an external URL.

The URL is accessible only to you and is available only while the notebook is running.

Parameters:

- `port` (`int`): Port a service is running on. Required.
- `path` (`str`): Path the service is running on. Default: `'/'`.

Returns:

- `None`

Source code in fns/colab.py
def expose_port(port: int, path: str = "/") -> None:
    """
    Expose port as an external URL.

    The URL is only accessible to you and available till the notebook runs.

    Args:
        port: Port a service is running on
        path: Path the service is running on

    Returns:
        None
    """
    output = import_module("google.colab.output")
    output.serve_kernel_port_as_window(port, path=path)

jupyter(subdomain, port=9003)

Start a jupyter notebook server using localtunnel.

Returns:

- `None`

Source code in fns/colab.py
def jupyter(subdomain: str, port: int = 9003) -> None:
    """
    Start a jupyter notebook server using localtunnel.

    Returns:
        None
    """
    command = f"jupyter-notebook --ip='*' --no-browser --allow-root --port 9003 & npx localtunnel -p {port} -s {subdomain} --allow-invalid-cert"
    run_foreground(command)

run_background(command)

Run a bash command in background.

Parameters:

- `command` (`str`): Bash command. Required.

Returns:

- `None`

Source code in fns/colab.py
def run_background(command: str) -> None:
    """
    Run a bash command in background.

    Args:
        command: Bash command

    Returns:
        None
    """
    subprocess.Popen(command, shell=True)

run_foreground(cmd)

Run a bash command in foreground.

Reference: http://blog.kagesenshi.org/2008/02/teeing-python-subprocesspopen-output.html

Parameters:

- `cmd` (`str`): Bash command. Required.

Returns:

- `None`

Source code in fns/colab.py
def run_foreground(cmd: str) -> None:
    """
    Run a bash command in foreground.

    Reference: http://blog.kagesenshi.org/2008/02/teeing-python-subprocesspopen-output.html

    Args:
        cmd: Bash command

    Returns:
        None
    """
    p = subprocess.Popen(
        cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
    )
    while True:
        line = p.stdout.readline().decode()
        print(line.strip())
        # readline() returns an empty string at EOF; stop once the process exits
        if not line and p.poll() is not None:
            break
    return None

vscode(subdomain='amitness', port=9000, config_save_path='/content/drive/MyDrive/colab/.vscode')

Start VSCode server which persists all settings and extensions.

Parameters:

- `subdomain` (`str`): Subdomain for localtunnel. Default: `'amitness'`.
- `port` (`int`): Port for running code-server. Default: `9000`.
- `config_save_path` (`str`): Path in Google Drive to save VSCode settings. Default: `'/content/drive/MyDrive/colab/.vscode'`.

Returns:

- `None`

Source code in fns/colab.py
def vscode(
    subdomain: str = "amitness",
    port: int = 9000,
    config_save_path: str = "/content/drive/MyDrive/colab/.vscode",
) -> None:
    """
    Start VSCode server which persists all settings and extensions.

    Args:
        subdomain: Subdomain for localtunnel.
        port: Port for running code-server
        config_save_path: Path in Google Drive to save VSCode settings

    Returns:
        None
    """
    drive = import_module("google.colab.drive")
    drive.mount("/content/drive")
    subprocess.run(["curl", "-fsSL", "https://code-server.dev/install.sh", "-O"])
    subprocess.run(["bash", "install.sh", "--version", "3.10.2"])
    subprocess.run(["pip3", "install", "flake8", "--user"])
    subprocess.run(["pip3", "install", "black", "--user"])
    print(f"https://{subdomain}.loca.lt/?folder=/content/drive/MyDrive/colab")
    run_foreground(
        f"code-server --port {port} --auth none --disable-telemetry --force --user-data-dir {config_save_path} & npx localtunnel -p {port} -s {subdomain} --allow-invalid-cert"
    )

Functions

display_all()

Show all the rows and columns when printing dataframe.

Returns:

- `None`

Source code in fns/dataframe.py
def display_all() -> None:
    """
    Show all the rows and columns when printing dataframe.

    Returns:
        None
    """
    import pandas as pd

    pd.set_option("display.max_colwidth", None)
    pd.set_option("display.max_rows", None)

explore_df(df)

Perform a quick peek of a dataframe.

Currently shows:

- Number of null elements in each column
- Data type of each column
- One example value for each column

Parameters:

- `df` (`DataFrame`): Pandas DataFrame. Required.

Returns:

- `DataFrame`: DataFrame with summary info

Source code in fns/dataframe.py
def explore_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Perform a quick peek of a dataframe.

    Currently shows:
    - Number of null elements in each column
    - Data type of each column
    - One example data for each column

    Args:
        df: Pandas DataFrame

    Returns:
        DataFrame with summary infos
    """
    null_df = pd.DataFrame(df.isnull().sum(), columns=["num_nulls"])
    dtype_df = pd.DataFrame(df.dtypes, columns=["dtype"])
    return df.T.sample(1, axis=1).join([dtype_df, null_df]).rename_axis("Columns")

fake_df()

Generate a dataframe filled with random data.

Returns:

- `DataFrame`: Pandas DataFrame

Source code in fns/dataframe.py
def fake_df() -> pd.DataFrame:
    """
    Generate a dataframe filled with random data.

    Returns:
        Pandas DataFrame
    """
    return pd.util.testing.makeDataFrame()

is_outlier(values)

Generate a mask indicating whether each element is an outlier.

Extra:

Condition 1: < Q1 - 1.5 * IQR
Condition 2: > Q3 + 1.5 * IQR

Parameters:

- `values` (`List`): List of numerical values. Required.

Returns:

- `List[bool]`: List of booleans indicating whether each element is an outlier

Source code in fns/dataframe.py
def is_outlier(values: List) -> List[bool]:
    """
    Generate a mask if an element is an outlier or not.

    Extra:
    ```
    Condition 1: < Q1 - 1.5 * IQR
    Condition 2: > Q3 + 1.5 * IQR
    ```

    Args:
        values: List of numerical values

    Returns:
        List of boolean indicating if an element is outlier or not
    """
    q1 = np.quantile(values, 0.25)
    q3 = np.quantile(values, 0.75)
    iqr = q3 - q1
    lower_threshold = q1 - 1.5 * iqr
    upper_threshold = q3 + 1.5 * iqr
    return (values < lower_threshold) | (values > upper_threshold)
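
A usage sketch; note that despite the `List` type hint, the element-wise comparisons require a NumPy array:

```python
import numpy as np

is_outlier(np.array([1, 2, 3, 100]))
# array([False, False, False,  True])
```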

no_wrapping()

Return a context manager to display all rows and columns.

Examples:


with no_wrapping():
    print(df)

Returns:

- Context Manager

Source code in fns/dataframe.py
def no_wrapping():
    """
    Return a context manager to display all rows and columns.

    Examples:
    ```python
    with no_wrapping():
        print(df)
    ```

    Returns:
        Context Manager
    """
    return pd.option_context("display.max_rows", None, "display.max_columns", None)

print_groups(df, column)

Pretty print all subsets of a groupby.

Parameters:

- `df` (`DataFrame`): Pandas DataFrame. Required.
- `column` (`str`): Column name to group by. Required.

Returns:

- `None`

Source code in fns/dataframe.py
def print_groups(df: pd.DataFrame, column: str) -> None:
    """
    Pretty print all subsets of a groupby.

    Args:
        df: Pandas DataFrame
        column: Column Name to group by

    Returns:
        None
    """
    for current_group, sub_df in df.groupby(column):
        print(f"Group: {current_group}")
        print()
        # Skip group column
        mask = ~(sub_df.columns.isin([column]))
        print(sub_df.loc[:, mask])
        print()
        print("---" * 25)

read_dict(data)

Create a dataframe from a dictionary whose values have unequal lengths.

Parameters:

- `data` (`Dict`): Dictionary with column names as keys and rows as values. Required.

Returns:

- `DataFrame`: Pandas DataFrame

Source code in fns/dataframe.py
def read_dict(data: Dict) -> pd.DataFrame:
    """
    Create a dataframe from dictionary with unequal elements.

    Args:
        data: Dictionary with column names as keys and rows as values

    Returns:
        Pandas DataFrame
    """
    return pd.DataFrame.from_dict(data, orient="index").transpose()

to_excel(path, df, sheet_name, index=False, mode='a')

Add a dataframe to an existing Excel file.

Parameters:

- `path` (`Union[pathlib.Path, str]`): Path of the Excel file. Required.
- `df` (`DataFrame`): Pandas DataFrame. Required.
- `sheet_name` (`str`): The sheet name to save in. Required.
- `index` (`bool`): Keep or remove index. Default: `False`.
- `mode` (`str`): 'a' for append or 'w' for write. Default: `'a'`.

Returns:

- `None`

Source code in fns/dataframe.py
def to_excel(
    path: Union[Path, str],
    df: pd.DataFrame,
    sheet_name: str,
    index: bool = False,
    mode: str = "a",
) -> None:
    """
    Add a dataframe to an existing Excel file.

    Args:
        path: Path of the excel file
        df: Pandas DataFrame
        sheet_name: The sheet name to save in
        index: Keep or remove index
        mode: 'a' for append or 'w' for write

    Returns:
        None
    """
    with pd.ExcelWriter(path, mode=mode) as writer:
        df.to_excel(writer, sheet_name=sheet_name, index=index)

Functions

batched(batch_size=32)

Apply a function over small batches of a list and combine results.

Parameters:

- `batch_size` (`int`): Size of each mini-batch. Default: `32`.

Returns:

- `Callable`: Decorator for the batch size

Source code in fns/decorators.py
def batched(batch_size: int = 32) -> Callable:
    """
    Apply a function over small batches of a list and combine results.

    Args:
        batch_size: Size of each mini-batch

    Returns:
        Decorator for the batch size
    """

    def decorator(func) -> Callable:
        @functools.wraps(func)
        def inner(*args, **kwargs):
            items = args[0]
            results = []
            for batch in minibatch(items, batch_size):
                batch_results = func(batch)
                results.extend(batch_results)
            return results

        return inner

    return decorator
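
A usage sketch of the decorator (the `double` function is hypothetical):

```python
@batched(batch_size=2)
def double(items):
    return [i * 2 for i in items]

double([1, 2, 3, 4, 5])
# [2, 4, 6, 8, 10]
```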

deduplicate(func)

Decorator to deduplicate results of a function.

Usage:

@deduplicate
def test():
    return [1, 2, 3, 1]

Parameters:

- `func` (`Callable`): Function. Required.

Returns:

- `Callable`: Function

Source code in fns/decorators.py
def deduplicate(func: Callable) -> Callable:
    """
    Decorator to deduplicate results of a function.

    Usage:
    ```python
    @deduplicate
    def test():
        return [1, 2, 3, 1]
    ```

    Args:
        func: Function

    Returns:
        Function
    """

    @functools.wraps(func)
    def inner(*args, **kwargs):
        return list(set(func(*args, **kwargs)))

    return inner

named_timer(func)

Decorator to store time taken for wrapped functions.

Parameters:

- `func` (`Callable`): Python Function. Required.

Returns:

- `Callable`: Decorated function

Source code in fns/decorators.py
def named_timer(func: Callable) -> Callable:
    """
    Decorator to store time taken for wrapped functions.

    Args:
        func: Python Function

    Returns:
        Decorated function
    """

    @functools.wraps(func)
    def inner(*args, **kwargs):
        start_time = time.perf_counter()
        value = func(*args, **kwargs)
        name = func.__name__
        named_timer.times[name] = time.perf_counter() - start_time
        return value

    return inner


# The timings dict must exist before the decorator is first used; this
# initialization is added here so the snippet is self-contained.
named_timer.times = {}

show_shapes(func)

Decorator to log dataframe shape before and after applying a function.

Parameters:

- `func` (`Callable`): Function that takes a dataframe as argument. Required.

Returns:

- `Callable`: function

Source code in fns/decorators.py
def show_shapes(func: Callable) -> Callable:
    """
    Decorator to log dataframe shape before and after applying a function.

    Args:
        func: Function that takes a dataframe as argument

    Returns:
        function
    """

    @functools.wraps(func)
    def inner(df):
        print(f"Shape before {func.__name__}", df.shape)
        out_df = func(df)
        print(f"Shape after {func.__name__}", out_df.shape)
        return out_df

    return inner

timeit(func)

Decorator to calculate time taken for a function to complete.

Parameters:

- `func` (`Callable`): Python Function. Required.

Returns:

- `Callable`: Decorated function

Source code in fns/decorators.py
def timeit(func: Callable) -> Callable:
    """
    Decorator to calculate time taken for a function to complete.

    Args:
        func: Python Function

    Returns:
        Decorated function
    """
    @functools.wraps(func)
    def inner(*args, **kwargs):
        # Start the clock at call time (not at decoration time) and
        # pass through the wrapped function's return value
        start_time = time.time()
        result = func(*args, **kwargs)
        total_time_taken = time.time() - start_time
        print("Total time taken: {} seconds".format(total_time_taken))
        return result

    return inner

to(data_type)

Apply a data type to returned data from a function.

Parameters:

- `data_type`: The data type to apply, e.g. `list`, `int`. Required.

Returns:

- `Callable`: Decorator that applies the data type on returned data

Source code in fns/decorators.py
def to(data_type) -> Callable:
    """
    Apply a data type to returned data from a function.

    Args:
        data_type: The data type to apply. Eg: list, int etc.

    Returns:
        Decorator that applies the data type on returned data
    """

    def decorator(func) -> Callable:
        @functools.wraps(func)
        def inner(*args, **kwargs):
            return data_type(func(*args, **kwargs))

        return inner

    return decorator
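
A usage sketch (the generator function is hypothetical):

```python
@to(list)
def first_three():
    yield from range(3)

first_three()
# [0, 1, 2]
```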

Functions

dict_words()

Fetch default list of words present in Linux distros.

Returns:

- `List[str]`: List of words

Source code in fns/lexicon.py
def dict_words() -> List[str]:
    """
    Fetch default list of words present in Linux distros.

    Returns:
        List of words
    """
    return Path("/usr/share/dict/words").read_text().splitlines()

onegram_count()

Get counts of 1-gram from Peter Norvig's list.

Returns:

- `DataFrame`: DataFrame with one-gram, count and idf scores.

Source code in fns/lexicon.py
@lru_cache(1)
def onegram_count() -> pd.DataFrame:
    """
    Get counts of 1-gram from Peter Norvig's list.

    Returns:
        DataFrame with one-gram, count and idf scores.
    """
    df = pd.read_csv(
        "https://norvig.com/ngrams/count_1w.txt",
        sep="\t",
        header=None,
        names=["word", "count"],
    )
    df["idf"] = np.log(df["count"].sum() / df["count"])
    df.sort_values(by="idf", ascending=True, inplace=True)
    return df

Functions

baseline_accuracy(labels)

Get accuracy for always majority class classifier.

Usage:

>>> baseline_accuracy([0, 1])
50.0

Parameters:

- `labels` (`List`): List of class labels. Required.

Returns:

- `float`: Baseline accuracy

Source code in fns/metrics.py
def baseline_accuracy(labels: List) -> float:
    """
    Get accuracy for always majority class classifier.

    Usage:
    ```python
    >>> baseline_accuracy([0, 1])
    50.0
    ```

    Args:
        labels: List of class labels.

    Returns:
        Baseline accuracy
    """
    (label, count), *_ = Counter(labels).most_common(1)
    return count / len(labels) * 100.0

benchmark_function(fn, repeat=5)

Benchmark time taken for a function and return metrics.

Parameters:

- `fn` (`Callable`): A python function. Required.
- `repeat` (`int`): Number of samples. Default: `5`.

Returns:

- `Dict`: Dictionary of total times, mean and std of times

Source code in fns/metrics.py
def benchmark_function(fn: Callable, repeat: int = 5) -> Dict:
    """
    Benchmark time taken for a function and return metrics.

    Args:
        fn: A python function
        repeat: Number of samples

    Returns:
        Dictionary of total times, mean and std of times
    """
    iteration_times = timeit.repeat(fn, repeat=repeat, number=1)
    return {
        "time": iteration_times,
        "mean": np.mean(iteration_times),
        "std": np.std(iteration_times),
    }

clustering_report(y_true, y_pred)

Generate cluster evaluation metrics.

Parameters:

- `y_true`: Array of actual labels. Required.
- `y_pred`: Array of predicted clusters. Required.

Returns:

- `DataFrame`: Pandas DataFrame with metrics.

Source code in fns/metrics.py
def clustering_report(y_true, y_pred) -> pd.DataFrame:
    """
    Generate cluster evaluation metrics.


    Args:
        y_true: Array of actual labels
        y_pred: Array of predicted clusters

    Returns:
        Pandas DataFrame with metrics.
    """
    return pd.DataFrame(
        {
            "Homogeneity": M.homogeneity_score(y_true, y_pred),
            "Completeness": M.completeness_score(y_true, y_pred),
            "V-Measure": M.v_measure_score(y_true, y_pred),
            "Adjusted Rand Index": M.adjusted_rand_score(y_true, y_pred),
            "Adjusted Mutual Information": M.adjusted_mutual_info_score(y_true, y_pred),
        },
        index=["value"],
    ).T

jaccard(x, y)

Compute jaccard similarity (intersection over union).

Parameters:

- `x`: Array-like object. Required.
- `y`: Array-like object. Required.

Returns:

- `float`: Intersection Over Union score

Source code in fns/metrics.py
def jaccard(x, y) -> float:
    """
    Compute jaccard similarity (intersection over union).

    Args:
        x: Array-like object
        y: Array-like object

    Returns:
        Intersection Over Union score
    """
    s1 = set(x)
    s2 = set(y)
    if len(s1) == 0 and len(s2) == 0:
        return 0
    return len(s1 & s2) / len(s1 | s2)

missing_value_percent(df)

Get the percentage of missing values in each column.

Parameters:

- `df` (`DataFrame`): Pandas DataFrame. Required.

Returns:

- `DataFrame`: Percentage of missing values in each column.

Source code in fns/metrics.py
def missing_value_percent(df: pd.DataFrame) -> pd.DataFrame:
    """
    Get the percentage of missing values in each column.

    Args:
        df: Pandas DataFrame

    Returns:
        Percentage of missing value in each column.
    """
    num_rows = len(df)
    return (df.isna().sum() / num_rows * 100.0).sort_values(ascending=False)

multilabel_classification_report(y_true, y_pred)

Compute all metrics for a multi-label classification problem.

Parameters:

- `y_true`: True binarized labels. Required.
- `y_pred`: Predicted binarized labels. Required.

Returns:

- `Series`: Pandas series of metrics

Source code in fns/metrics.py
def multilabel_classification_report(y_true, y_pred) -> pd.Series:
    """
    Compute all metrics for a multi-label classification problem.
    Args:
        y_true: True binarized labels
        y_pred: Predicted binarized labels

    Returns:
        Pandas series of metrics
    """
    scores = {
        "accuracy": M.accuracy_score(y_true, y_pred),
        "precision_macro": M.precision_score(y_true, y_pred, average="macro"),
        "recall_macro": M.recall_score(y_true, y_pred, average="macro"),
        "f1_samples": M.f1_score(y_true, y_pred, average="samples"),
        "f1_macro": M.f1_score(y_true, y_pred, average="macro"),
        "f1_weighted": M.f1_score(y_true, y_pred, average="weighted"),
        "hamming_loss": M.hamming_loss(y_true, y_pred),
    }
    return pd.Series(scores)

n_clusters(data)

Generate number of clusters to create.

Heuristic: Number of clusters = square root of total data points

Parameters:

- `data`: Total number of data points, or the data points themselves. Required.

Returns:

- `int`: Number of clusters

Source code in fns/metrics.py
def n_clusters(data) -> int:
    """
    Generate number of clusters to create.

    Heuristic:
    Number of clusters = square root of total data points

    Args:
        data: Total number of data points or the data point itself

    Returns:
        Number of clusters
    """
    if type(data) is int:
        total_rows = data
    else:
        total_rows = len(set(data))
    return int(math.sqrt(total_rows))
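
For example:

```python
>>> n_clusters(100)
10
>>> n_clusters(['a', 'b', 'c', 'a'])  # 3 unique points
1
```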

outlier_cutoff(values)

Generate the lower and upper bound for outliers.

Extra:

Lower bound: < Q1 - 1.5 * IQR
Upper bound: > Q3 + 1.5 * IQR

Parameters:

- `values` (`List`): List of numerical values. Required.

Returns:

- `Tuple[float, float]`: Tuple of (lower-cutoff, upper-cutoff)

Source code in fns/metrics.py
def outlier_cutoff(values: List) -> Tuple[float, float]:
    """
    Generate the lower and upper bound for outliers.

    Extra:
    ```
    Lower bound: < Q1 - 1.5 * IQR
    Upper bound: > Q3 + 1.5 * IQR
    ```

    Args:
        values: List of numerical values

    Returns:
        Tuple of (lower-cutoff, upper-cutoff)
    """
    q1 = np.quantile(values, 0.25)
    q3 = np.quantile(values, 0.75)
    iqr = q3 - q1
    lower_threshold = q1 - 1.5 * iqr
    upper_threshold = q3 + 1.5 * iqr
    return lower_threshold, upper_threshold
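
A usage sketch (cutoffs follow from the quartiles of 1..10):

```python
outlier_cutoff(list(range(1, 11)))
# (-3.5, 14.5)
```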

sorted_classification_report(y_true, y_pred, **kwargs)

Generate class-wise classification report sorted from worst to best.

Parameters:

- `y_true`: Actual labels. Required.
- `y_pred`: Predicted labels. Required.

Returns:

- `DataFrame`: Classification report in sorted form.

Source code in fns/metrics.py
def sorted_classification_report(y_true, y_pred, **kwargs) -> pd.DataFrame:
    """
    Generate class-wise classification report sorted from worst to best.

    Args:
        y_true: Actual labels
        y_pred: Predicted labels

    Returns:
        Classification report in sorted form.
    """
    base_report = M.classification_report(y_true, y_pred, output_dict=True, **kwargs)
    base_report_df = pd.DataFrame.from_dict(base_report).T
    class_wise_df = base_report_df.iloc[:-3].sort_values(by="f1-score")
    summary_df = base_report_df.iloc[-3:]
    combined_df = pd.concat([class_wise_df, summary_df])
    combined_df["support"] = combined_df["support"].astype(int)
    return combined_df

Functions

grid_report(cv)

Display results from cross-validation.

Parameters:

- `cv`: Result of cross-validation. Required.

Returns:

- `DataFrame`: Pandas DataFrame

Source code in fns/model_selection.py
def grid_report(cv) -> pd.DataFrame:
    """
    Display results from cross-validation.

    Args:
        cv: Result of cross-validation

    Returns:
        Pandas DataFrame
    """
    columns = ["params", "mean_test_score", "std_test_score", "rank_test_score"]
    return pd.DataFrame(cv.cv_results_)[columns].sort_values(by=["rank_test_score"])

train_val_size(dataset, val_ratio=0.1)

Return the train and validation data sizes based on split ratio.

Parameters:

- `dataset`: A python collection. Required.
- `val_ratio` (`float`): Ratio for validation dataset. Default: `0.1`.

Returns:

- `Tuple[int, int]`: Tuple of number of rows for (training, validation)

Source code in fns/model_selection.py
def train_val_size(dataset, val_ratio: float = 0.1) -> Tuple[int, int]:
    """
    Return the train and validation data sizes based on split ratio.

    Args:
        dataset: A python collection
        val_ratio: Ratio for validation dataset

    Returns:
        Tuple of number of rows for (training, validation)
    """
    val_size = int(val_ratio * len(dataset))
    train_size = len(dataset) - val_size
    return train_size, val_size
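
For example:

```python
>>> train_val_size(range(100), val_ratio=0.2)
(80, 20)
```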

view_result_table(cv)

Display results from cross-validation.

Parameters:

- `cv`: Result of cross-validation. Required.

Returns:

- `DataFrame`: Pandas DataFrame

Source code in fns/model_selection.py
def view_result_table(cv) -> pd.DataFrame:
    """
    Display results from cross-validation.

    Args:
        cv: Result of cross-validation

    Returns:
        Pandas DataFrame
    """
    columns = ["params", "mean_test_score", "std_test_score", "rank_test_score"]
    return pd.DataFrame(cv.cv_results_)[columns].sort_values(by=["rank_test_score"])

Functions

validate_multiple_labels(y_raw)

Validate binarization of labels in a multi-label setting.

Parameters:

- `y_raw` (`List[List]`): Raw list of lists of labels. Required.

Returns:

- `None`

Source code in fns/multi_label.py
def validate_multiple_labels(y_raw: List[List]) -> None:
    """
    Validate binarization of labels in a multi-label setting.

    Args:
        y_raw: Raw list of list of labels.

    Returns:

    """
    y = MultiLabelBinarizer().fit_transform(y_raw)

    # Assert that every sample has at least one label
    assert (y.sum(axis=1) == 0).sum() == 0

    # Assert that every label is assigned to some data point
    assert (y.sum(axis=0) == 0).sum() == 0

    # Assert that no label is assigned to only one data point
    assert not (y.sum(axis=0) == 1).any()

Functions

download(file_path)

Download a file at given path.

Parameters:

- `file_path`: File path. Required.

Returns:

- `None`

Source code in fns/notebook.py
def download(file_path) -> None:
    """
    Download a file at given path.

    Args:
        file_path: File path

    Returns:
        None
    """
    from IPython.display import Javascript

    script = f"""
            var host = window.location.host;
            var downloadLink = window.location.protocol + "//" + host + "/files/{file_path}"
            window.open(downloadLink)
            """
    return Javascript(script)

download_df(df, csv_path=None)

Download a dataframe as a CSV with a random filename.

The filename is set to a random UUID.

Parameters:

- `df` (`DataFrame`): Pandas DataFrame. Required.
- `csv_path`: CSV filename. Default: `None`.

Returns:

- `None`

Source code in fns/notebook.py
def download_df(df: pd.DataFrame, csv_path=None) -> None:
    """
    Download a dataframe as a CSV with a random filename.

    The filename is set to a random UUID.

    Args:
        df: Pandas DataFrame
        csv_path: CSV filename.

    Returns:
        None
    """
    from IPython.display import display

    if not csv_path:
        from uuid import uuid4

        csv_path = f"{uuid4()}.csv"
    df.to_csv(csv_path, index=False)
    display(download(file_path=csv_path))
    time.sleep(1)
    Path(csv_path).unlink()

filter_column(df, column_name)

Show an interactive widget to filter a column in dataframe.

Parameters:

- `df` (`DataFrame`): Pandas DataFrame. Required.
- `column_name` (`str`): Column name of the DataFrame. Required.

Returns:

- `None`: Displays an interactive widget for filtering.

Source code in fns/notebook.py
def filter_column(df: pd.DataFrame, column_name: str) -> None:
    """
    Show an interactive widget to filter a column in dataframe.

    Args:
        df: Pandas DataFrame
        column_name: Column Name of the DataFrame

    Returns:
        Interactive widget for filtering.
    """

    from ipywidgets import interact

    options = sorted(df[column_name].unique())
    interact(lambda value: df[df[column_name] == value], value=options)

highlight_phrases(original_text, phrases, color_palette='Greens', weight=0.2)

Highlight a list of phrases in a text.

Parameters:

- `original_text` (`str`): Sentence. Required.
- `phrases` (`Union[List[str], str]`): A single phrase or a list of phrases. Required.
- `color_palette` (`str`): Any valid matplotlib color palette name. Default: `'Greens'`.
- `weight` (`float`): Darkness of the color. Default: `0.2`.

Returns:

- `None`

Source code in fns/notebook.py
def highlight_phrases(
    original_text: str,
    phrases: Union[List[str], str],
    color_palette: str = "Greens",
    weight: float = 0.2,
) -> None:
    """
    Highlight a list of phrases in a text.

    Args:
        original_text: Sentence
        phrases: A single phrase or a list of phrases
        color_palette: Any valid matplotlib color palette name
        weight: Darkness of the color

    Returns:
        None
    """
    import matplotlib.cm
    from IPython.display import HTML, display

    html = original_text
    cmap = matplotlib.cm.get_cmap(color_palette)
    color = f"rgba{cmap(weight, bytes=True)}"
    if type(phrases) is str:
        phrases = [phrases]
    for phrase in phrases:
        highlighted_phrase = (
            f'<span style="background-color: {color}; font-weight: {weight * 800};">'
            f"{phrase}"
            f"</span>"
        )
        html = html.replace(phrase, highlighted_phrase)
    display(HTML(f'<p style="color: #444; font-size:1.5em;">{html}</p>'))

print_bullets(lines)

Display a list of text as bullet points.

Parameters:

- `lines` (`List[str]`): List of texts. Required.

Returns:

- `None`

Source code in fns/notebook.py
def print_bullets(lines: List[str]) -> None:
    """
    Display a list of text as bullet points.

    Args:
        lines: List of texts

    Returns:
        None
    """
    bullet_points = "\n".join(f"- `{line}`" for line in sorted(lines))
    print_markdown(bullet_points)

print_header(text, level=2)

Display a text as markdown header.

Parameters:

- `text` (`str`): Text. Required.
- `level` (`int`): 2 for H2, 3 for H3, up to 6. Default: `2`.

Returns:

- `None`

Source code in fns/notebook.py
def print_header(text: str, level: int = 2) -> None:
    """
    Display a text as markdown header.

    Args:
        text: Text
        level: 2 for H2, 3 for H3 upto 6.

    Returns:
        None
    """
    print_markdown(f'{"#" * level} {text}')

search_dataframe(df)

Show an interactive widget to search text fields of a dataframe.

Parameters:

- `df` (`DataFrame`): Pandas DataFrame. Required.

Returns:

- `None`: Displays an interactive widget for searching.

Source code in fns/notebook.py
def search_dataframe(df: pd.DataFrame) -> None:
    """
    Show an interactive widget to search text fields of a dataframe.

    Args:
        df: Pandas DataFrame

    Returns:
        Interactive widget for searching.
    """

    from ipywidgets import interact
    from IPython.display import display

    def _search(query: str, column: str):
        if query:
            with pd.option_context(
                "display.max_rows", None, "display.max_columns", None
            ):
                filtered_df = df[
                    df[column].str.contains(query, case=False, regex=False)
                ]
                display(filtered_df)

    string_columns = df.select_dtypes("object").columns.tolist()
    interact(_search, query="", column=string_columns)

show_examples(df, group_column, data_column, n=5)

Show random examples for each sub-group in a dataframe.

Parameters:

- `df` (`DataFrame`): Dataframe. Required.
- `group_column` (`str`): Column name for performing group by. Required.
- `data_column` (`str`): Column to show examples for. Required.
- `n` (`int`): Number of examples. Default: `5`.

Returns:

- Markdown

Source code in fns/notebook.py
def show_examples(df: pd.DataFrame, group_column: str, data_column: str, n: int = 5):
    """
    Show random examples for each sub-group in a dataframe.

    Args:
        df: Dataframe
        group_column: Column name for performing group by
        data_column: Column to show examples for
        n: Number of examples

    Returns:
        Markdown
    """
    from IPython.display import Markdown

    generated_text = ""
    for group_name, subset in df.explode(group_column).groupby(group_column):
        examples = subset[data_column].sample(n)
        generated_text += f"## {group_name}\n\n"
        generated_text += "\n".join([f"- {example}" for example in examples])
        generated_text += "\n\n"
    return Markdown(generated_text)

Functions

confusion_matrix_plot(y_true, y_pred)

Plot a confusion matrix.

Parameters:

- `y_true`: List of true labels. Required.
- `y_pred`: List of prediction labels. Required.

Returns:

- `None`

Source code in fns/plot.py
def confusion_matrix_plot(y_true, y_pred) -> None:
    """
    Plot a confusion matrix.

    Args:
        y_true: List of true labels
        y_pred: List of prediction labels

    Returns:
    """
    from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

    cm = confusion_matrix(y_true, y_pred)
    plot = ConfusionMatrixDisplay(confusion_matrix=cm).plot()
    plot.ax_.set_title("Confusion Matrix")

Functions

combine_hyphenated_word(text)

Combine words in text that contain hyphen.

Example: e-mail to email

Parameters:

- `text` (`str`): A sentence. Required.

Returns:

- `str`: Processed sentence

Source code in fns/preprocessing.py
def combine_hyphenated_word(text: str) -> str:
    """
    Combine words in text that contain hyphen.

    Example: e-mail to email

    Args:
        text: A sentence

    Returns:
        Processed sentence
    """
    return " ".join(
        w.replace("-", "") if _re_hyphen_word.match(w) else w for w in text.split()
    )

normalize_json(json_data)

Convert any non-standard types in a dictionary to basic types.

The normalization prevents errors during serialization.

Usage:

>>> normalize_json({'nums': np.array([1, 2, 3])})
{'nums': [1, 2, 3]}

Parameters:

- `json_data` (`Dict`): Dictionary. Required.

Returns:

- `Dict`: Normalized dictionary

Source code in fns/preprocessing.py
def normalize_json(json_data: Dict) -> Dict:
    """
    Convert any non-standard types in dictionary to basic types.

    The normalization prevents errors during serialization.

    Usage:
    ```python
    >>> normalize_json({'nums': np.array([1, 2, 3])})
    {'nums': [1, 2, 3]}
    ```

    Args:
        json_data: Dictionary

    Returns:
        Normalized dictionary
    """
    return json.loads(json.dumps(json_data, cls=NpEncoder))

remove_hashtag(t)

Remove hashtag from the text.

Parameters:

- `t` (`str`): Text. Required.

Returns:

- `str`: Text without hashtag

Source code in fns/preprocessing.py
def remove_hashtag(t: str) -> str:
    """
    Remove hashtag from the text.

    Args:
        t: Text

    Returns:
        Text without hashtag
    """
    return _re_hashtag.sub("", t)

remove_hyperlink(t)

Remove hyperlinks from a text.

Parameters:

- `t` (`str`): Text. Required.

Returns:

- `str`: Text without hyperlinks.

Source code in fns/preprocessing.py
def remove_hyperlink(t: str) -> str:
    """
    Remove hyperlinks from a text.

    Args:
        t: Text

    Returns:
        Text without hyperlinks.
    """
    return _re_hyperlink.sub("", t)

remove_multiple_commas(t)

Substitute multiple consecutive commas with a single comma.

Usage:

>>> remove_multiple_commas('a,,b,c')
'a,b,c'

Parameters:

- `t` (`str`): Text. Required.

Returns:

- `str`: Text without multiple commas.

Source code in fns/preprocessing.py
def remove_multiple_commas(t: str) -> str:
    """
    Substitute multiple consecutive commas with a single comma.

    Usage:
    ```python
    >>> remove_multiple_commas('a,,b,c')
    'a,b,c'
    ```

    Args:
        t: Text

    Returns:
        Text without multiple commas.
    """
    return _re_comma.sub(",", t)

remove_multiple_space(t)

Remove multiple spaces from the text.

Adapted from: https://github.com/fastai/fastai/blob/master/fastai/text/core.py

Parameters:

- `t` (`str`): Text. Required.

Returns:

- `str`: Text without multiple spaces.

Source code in fns/preprocessing.py
def remove_multiple_space(t: str) -> str:
    """
    Remove multiple spaces from the text.

    Adapted from: https://github.com/fastai/fastai/blob/master/fastai/text/core.py

    Args:
        t: Text

    Returns:
        Text without multiple space.
    """
    return _re_space.sub(" ", t)

remove_new_lines(text)

Strip away new lines at end.

Parameters:

- `text` (`str`): Text. Required.

Returns:

- `str`: Text without newlines at the end.

Source code in fns/preprocessing.py
def remove_new_lines(text: str) -> str:
    """
    Strip away new lines at end.

    Args:
        text: Text

    Returns:
        Text without newline at end.
    """
    if isinstance(text, str):
        return text.replace("\\n", "").strip()
    return text

remove_punctuation(text)

Remove all punctuation from a text.

Parameters:

- `text` (`str`): Sentence. Required.

Source code in fns/preprocessing.py
def remove_punctuation(text: str) -> str:
    """
    Remove all punctuations from a text.

    Args:
        text: Sentence

    """
    return "".join(t for t in text if t not in string.punctuation)

remove_retweet(t)

Remove RT from the text.

Parameters:

- `t` (`str`): Text. Required.

Returns:

- `str`: Text without RT symbol.

Source code in fns/preprocessing.py
def remove_retweet(t: str) -> str:
    """
    Remove RT from the text.

    Args:
        t: Text

    Returns:
        Text without RT symbol.
    """
    return _re_retweet.sub("", t)

remove_separator(text)

Keep only alphabet, number and space.

Parameters:

- `text` (`str`): required

Returns:

- `str`

Source code in fns/preprocessing.py
def remove_separator(text: str) -> str:
    """
    Keep only alphabet, number and space.

    Args:
        text:

    Returns:

    """
    no_separator_regex = re.compile(r"[^a-zA-Z0-9\s]")
    return no_separator_regex.sub("", text)

Functions

create_download_link(dataframe, filename, file_type='csv', index=False, header=True)

Generate a download link for a pandas dataframe.

Parameters:

- `dataframe`: Pandas DataFrame. Required.
- `filename` (`str`): Name of exported file. Required.
- `file_type` (`str`): Either 'csv' or 'tsv'. Default: `'csv'`.
- `index` (`bool`): Whether to include index of dataframe or not. Default: `False`.
- `header` (`bool`): Whether to include header of dataframe or not. Default: `True`.

Returns:

- Markdown to place in st.markdown(...)

Source code in fns/streamlit_utils.py
def create_download_link(
    dataframe,
    filename: str,
    file_type: str = "csv",
    index: bool = False,
    header: bool = True,
):
    """
    Generate a download link for a pandas dataframe.

    Args:
        dataframe: Pandas DataFrame
        filename: Name of exported file
        file_type: Either 'csv' or 'tsv'
        index: Whether to include index of dataframe or not
        header: Whether to include header of dataframe or not

    Returns:
        Markdown to place in st.markdown(...)
    """
    if file_type == "csv":
        dataframe_csv = dataframe.to_csv(index=index)
    elif file_type == "tsv":
        dataframe_csv = dataframe.to_csv(
            index=index, sep="\t", header=header, quoting=csv.QUOTE_NONNUMERIC
        )
    else:
        raise Exception('Invalid file_type. Allowed values are "csv" and "tsv".')

    b64 = base64.b64encode(dataframe_csv.encode()).decode()
    href = f'**DOWNLOAD:** <a href="data:file/csv;base64,{b64}" download="{filename}">{filename}</a>'
    return href

Functions

export_fasttext_format(texts, labels, filename)

Export training data to a fasttext compatible format.

Format: __label__POSITIVE it was good

Parameters:

- `texts` (`List[str]`): List of sentences. Required.
- `labels` (`Union[List[str], List[List[str]]]`): List of single or multi-label classes. Required.
- `filename`: Exported filename. Required.

Returns:

- `None`

Source code in fns/text.py
def export_fasttext_format(
    texts: List[str], labels: Union[List[str], List[List[str]]], filename
) -> None:
    """
    Export training data to a fasttext compatible format.

    Format:
    __label__POSITIVE it was good

    Args:
        texts: List of sentences
        labels: List of single or multi-label classes
        filename: Exported filename

    Returns:
        None
    """
    output = []
    for text, text_label in zip(texts, labels):
        if isinstance(text_label, str):
            text_label = [text_label]
        # Use a separate name to avoid shadowing the `labels` argument
        label_prefix = " ".join(f"__label__{label}" for label in text_label)
        output.append(f"{label_prefix} {text}\n")
    with open(filename, "w") as fp:
        fp.writelines(output)
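
A usage sketch with illustrative data; the exported file gets one `__label__`-prefixed line per text:

```python
texts = ["it was good", "service was slow"]
labels = ["POSITIVE", "NEGATIVE"]  # lists of labels also work (multi-label)
export_fasttext_format(texts, labels, "train.txt")
# train.txt now contains:
# __label__POSITIVE it was good
# __label__NEGATIVE service was slow
```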

extract_abbreviations(texts)

Get a list of all-capitalized words.

Example: WWW, HTTP, etc.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| texts | List[str] | List of sentences | required |

Returns:

| Type | Description |
| --- | --- |
| List[str] | List of abbreviations |

Source code in fns/text.py
def extract_abbreviations(texts: List[str]) -> List[str]:
    """
    Get a list of all-capitalized words.

    Example: WWW, HTTP, etc.

    Args:
        texts: List of sentences

    Returns:
        List of abbreviations
    """
    combined_text = "\n".join(texts)
    symbols = re.findall(r"\b[A-Z][A-Z]+\b", combined_text)
    return list(set(symbols))
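
A doctest-style sketch; the result is sorted here because the underlying `set` has no stable order:

```python
>>> sorted(extract_abbreviations(["The WWW runs on HTTP.", "NASA launched it."]))
['HTTP', 'NASA', 'WWW']
```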

extract_discriminative_keywords(df, category_column, text_column, ngram=2, n=10)

Generate discriminative keywords for texts in each category.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| df | DataFrame | Dataframe with text and category columns. | required |
| text_column | str | Column name containing texts | required |
| category_column | str | Column name for the text category | required |
| ngram | int | 1 for words, 2 for bigrams and so on. | 2 |
| n | int | Number of keywords to return. | 10 |

Returns:

| Type | Description |
| --- | --- |
| DataFrame | Dataframe with categories in columns and top-n keywords in each column. |

Source code in fns/text.py
def extract_discriminative_keywords(
    df: pd.DataFrame,
    category_column: str,
    text_column: str,
    ngram: int = 2,
    n: int = 10,
) -> pd.DataFrame:
    """
    Generate discriminative keywords for texts in each category.

    Args:
        df: Dataframe with text and category columns.
        text_column: Column name containing texts
        category_column: Column name for the text category
        ngram: 1 for words, 2 for bigrams and so on.
        n: Number of keywords to return.

    Returns:
        Dataframe with categories in columns and top-n keywords in each column.
    """
    # Combine all texts into a single document for each category
    category_docs = df.groupby(by=category_column)[text_column].apply(" ".join)
    categories = category_docs.index.tolist()

    tfidf = TfidfVectorizer(
        ngram_range=(1, ngram),
        stop_words="english",
        strip_accents="unicode",
        sublinear_tf=True,
    )
    document_vectors = tfidf.fit_transform(category_docs).A
    keywords = np.array(tfidf.get_feature_names_out())
    # argsort is ascending, so reverse each row to pick the highest-scoring terms
    top_terms = document_vectors.argsort(axis=1)[:, ::-1][:, :n]
    return pd.DataFrame(keywords[top_terms].T, columns=categories)
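
A usage sketch with a tiny illustrative dataframe; real corpora need substantially more text per category for the tf-idf scores to be meaningful:

```python
df = pd.DataFrame(
    {
        "category": ["sports", "sports", "tech", "tech"],
        "text": [
            "the team won the match",
            "a great goal in the final minute",
            "the new phone ships with a faster chip",
            "the software update improves the camera",
        ],
    }
)
keywords_by_category = extract_discriminative_keywords(
    df, category_column="category", text_column="text", ngram=1, n=3
)
# One column per category ("sports", "tech"), one keyword per row
```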

extract_tfidf_keywords(texts, ngram=2, n=10)

Get top keywords based on mean tf-idf term score.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| texts | List[str] | List of sentences | required |
| ngram | int | 1 for words, 2 for bigrams and so on. | 2 |
| n | int | Number of keywords to extract | 10 |

Returns:

| Type | Description |
| --- | --- |
| List[str] | Keywords |

Source code in fns/text.py
def extract_tfidf_keywords(texts: List[str], ngram: int = 2, n: int = 10) -> List[str]:
    """
    Get top keywords based on mean tf-idf term score.

    Args:
        texts: List of sentences
        ngram: 1 for words, 2 for bigrams and so on.
        n: Number of keywords to extract

    Returns:
        Keywords
    """
    tfidf = TfidfVectorizer(
        ngram_range=(1, ngram),
        stop_words="english",
        strip_accents="unicode",
        sublinear_tf=True,
    )
    vectors = tfidf.fit_transform(texts)
    term_tfidf = vectors.A.mean(axis=0)
    terms = np.array(tfidf.get_feature_names_out())
    return terms[term_tfidf.argsort()[::-1]][:n].tolist()
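
A usage sketch with an illustrative corpus; the exact keywords and their order depend on the fitted tf-idf scores:

```python
corpus = [
    "machine learning models need data",
    "deep learning models need lots of data",
]
extract_tfidf_keywords(corpus, ngram=1, n=3)
# e.g. ['data', 'models', 'learning']
```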

is_non_ascii(text)

Check if text has non-ascii characters.

Useful heuristic to find text containing emojis and non-English characters.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| text | str | Sentence | required |

Returns:

| Type | Description |
| --- | --- |
| bool | True if the text contains non-ascii characters. |

Source code in fns/text.py
def is_non_ascii(text: str) -> bool:
    """
    Check if text has non-ascii characters.

    Useful heuristic to find text containing emojis and non-English
    characters.

    Args:
        text: Sentence

    Returns:
        True if the text contains non-ascii characters.
    """
    try:
        text.encode("ascii")
        return False
    except UnicodeEncodeError:
        return True
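
A doctest-style sketch:

```python
>>> is_non_ascii("plain text")
False
>>> is_non_ascii("café ☕")
True
```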

md5_hash(text)

Generate MD5 hash of a text.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| text | str | String | required |

Returns:

| Type | Description |
| --- | --- |
| str | MD5 hash |

Source code in fns/text.py
def md5_hash(text: str) -> str:
    """
    Generate MD5 hash of a text.

    Args:
        text: String

    Returns:
        MD5 hash
    """
    return hashlib.md5(text.encode("utf-8")).hexdigest()
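
A doctest-style sketch; the MD5 digest of "hello" is a well-known test vector:

```python
>>> md5_hash("hello")
'5d41402abc4b2a76b9719d911017c592'
```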

num_words(text)

Counts the number of words using whitespace as delimiter.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| text | str | Sentence | required |

Returns:

| Type | Description |
| --- | --- |
| int | Number of words |

Source code in fns/text.py
def num_words(text: str) -> int:
    """
    Counts the number of words using whitespace as delimiter.

    Args:
        text: Sentence

    Returns:
        Number of words
    """
    return len(text.split())
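
A doctest-style sketch; `str.split()` with no arguments collapses runs of whitespace, so extra spaces do not inflate the count:

```python
>>> num_words("the quick brown fox")
4
>>> num_words("  the   quick  fox  ")
3
```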

offset_by_one(x, sequence_length=3)

Generate a list of small sequences offset by 1.

Usage:

>>> offset_by_one([1, 2, 3, 4, 5], sequence_length=3)
[([1, 2, 3], [2, 3, 4])]

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| x | | Python list | required |
| sequence_length | int | Chunk size | 3 |
Source code in fns/text.py
def offset_by_one(x, sequence_length: int = 3):
    """
    Generate a list of small sequences offset by 1.

    Usage:

    ```python
    >>> offset_by_one([1, 2, 3, 4, 5], sequence_length=3)
    [([1, 2, 3], [2, 3, 4])]
    ```

    Args:
        x: Python list
        sequence_length: Chunk size

    Returns:
        List of (sequence, next_sequence) tuple pairs.
    """
    sl = sequence_length
    return [
        (x[i : i + sl], x[i + 1 : i + sl + 1]) for i in range(0, len(x) - sl - 1, sl)
    ]

sha256hash(text)

Generate SHA256 hash of a text.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| text | str | String | required |

Returns:

| Type | Description |
| --- | --- |
| str | SHA256 hash |

Source code in fns/text.py
def sha256hash(text: str) -> str:
    """
    Generate SHA256 hash of a text.

    Args:
        text: String

    Returns:
        SHA256 hash
    """
    return hashlib.sha256(text.encode("utf-8")).hexdigest()
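
A doctest-style sketch; the SHA256 digest of "hello" is a well-known test vector:

```python
>>> sha256hash("hello")
'2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824'
```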

span_positions(text, phrases)

Find span position of phrases in a text.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| text | str | Sentence | required |
| phrases | List[str] | List of phrases | required |

Returns:

| Type | Description |
| --- | --- |
| List[Tuple[int, int]] | List of span positions for each phrase. The span position is a tuple of start and end index. |

Source code in fns/text.py
def span_positions(text: str, phrases: List[str]) -> List[Tuple[int, int]]:
    """
    Find span position of phrases in a text.

    Args:
        text: Sentence
        phrases: List of phrases

    Returns:
        List of span positions for each phrase.
        The span position is a tuple of start and end index.
    """
    capture_group = "|".join([re.escape(phrase) for phrase in phrases])
    reg = re.compile(rf"\b({capture_group})\b", flags=re.IGNORECASE)
    return [match.span() for match in reg.finditer(text)]
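
A doctest-style sketch; matching is case-insensitive and anchored at word boundaries:

```python
>>> span_positions("Deep learning and machine learning", ["deep learning", "machine learning"])
[(0, 13), (18, 34)]
```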

unique_chars(texts)

Get a list of unique characters from list of text.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| texts | List[str] | List of sentences | required |

Returns:

| Type | Description |
| --- | --- |
| List[str] | A sorted list of unique characters |

Source code in fns/text.py
def unique_chars(texts: List[str]) -> List[str]:
    """
    Get a list of unique characters from list of text.

    Args:
        texts: List of sentences

    Returns:
        A sorted list of unique characters
    """
    return sorted(set("".join(texts)))
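
A doctest-style sketch:

```python
>>> unique_chars(["ab", "bca"])
['a', 'b', 'c']
```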

window(tokens, size=3)

Generate samples for a window size.

Examples:


>>> window(['a', 'b', 'c', 'd'], size=2)
[(['a', 'b'], 'c'), (['b', 'c'], 'd')]

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| tokens | | List of tokens | required |
| size | int | Window size | 3 |

Returns:

| Type | Description |
| --- | --- |
| | List of windowed samples |

Source code in fns/text.py
def window(tokens, size: int = 3):
    """
    Generate samples for a window size.

    Example:
    ```python
    >>> window(['a', 'b', 'c', 'd'], size=2)
    [(['a', 'b'], 'c'), (['b', 'c'], 'd')]
    ```

    Args:
        tokens: List of tokens
        size: Window size

    Returns:
        List of windowed samples
    """
    return [
        (tokens[i : i + size], tokens[i + size])
        for i in range(0, len(tokens) - size, 1)
    ]

Functions

imagenet_index_to_class()

Get a mapping from imagenet class index to class names.

Returns:

| Type | Description |
| --- | --- |
| Dict[int, str] | Mapping from imagenet class index to class names |

Source code in fns/vision.py
def imagenet_index_to_class() -> Dict[int, str]:
    """
    Get a mapping from imagenet class index to class names.

    Returns:
        Mapping from imagenet class index to class names
    """
    raw_mapping = json.load(urlopen(IMAGENET_LABEL_TO_CLASS_URL))
    return {int(index): class_name for index, class_name in raw_mapping.items()}
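
A usage sketch; the call fetches the mapping from the module-level `IMAGENET_LABEL_TO_CLASS_URL` constant, so it needs network access, and the exact class string for a given index depends on that mapping file:

```python
index_to_class = imagenet_index_to_class()
index_to_class[0]  # e.g. 'tench' in common ImageNet label mappings
```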