MatrixSynapse/synapse/util/iterutils.py

# Copyright 2014-2016 OpenMarket Ltd
# Copyright 2020 The Matrix.org Foundation C.I.C.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import heapq
from itertools import islice
from typing import (
    Callable,
    Collection,
    Dict,
    Generator,
    Iterable,
    Iterator,
    List,
    Mapping,
    Set,
    Sized,
    Tuple,
    TypeVar,
)

from typing_extensions import Protocol

T = TypeVar("T")
S = TypeVar("S", bound="_SelfSlice")


class _SelfSlice(Sized, Protocol):
    """A helper protocol that matches types where taking a slice results in the
    same type being returned.

    This is more specific than `Sequence`, which allows another `Sequence` to be
    returned.

    Because the protocol also derives from `Sized`, matching types must support
    `len()` in addition to slicing.
    """

    def __getitem__(self: S, i: slice) -> S:
        # Only slice access is described here: slicing must give back the same
        # type as `self`.
        ...


def batch_iter(iterable: Iterable[T], size: int) -> Iterator[Tuple[T, ...]]:
    """Group the items of an iterable into tuples of at most `size` items.

    Args:
        iterable: the iterable to take items from
        size: the maximum number of items in each batch

    Returns:
        an iterator over the batches; the final batch may be shorter than `size`
    """
    # Take a single shared iterator up front, so that re-iterable inputs such
    # as lists are consumed progressively rather than restarted for each batch.
    source = iter(iterable)

    def _take_batch() -> Tuple[T, ...]:
        return tuple(islice(source, size))

    # The two-argument form of `iter` keeps calling `_take_batch` and stops as
    # soon as it produces the sentinel, i.e. an empty tuple.
    return iter(_take_batch, ())


def chunk_seq(iseq: S, maxlen: int) -> Iterator[S]:
    """Break a sliceable sequence up into consecutive pieces of at most
    `maxlen` items.

    Every piece has the same type as the input. Only the final piece can be
    shorter than `maxlen`, and an empty input produces no pieces at all.
    """
    # Offsets at which each piece begins: 0, maxlen, 2 * maxlen, ...
    starts = range(0, len(iseq), maxlen)
    return (iseq[start : start + maxlen] for start in starts)


def partition(
    iterable: Iterable[T], predicate: Callable[[T], bool]
) -> Tuple[List[T], List[T]]:
    """
    Split an iterable into two lists according to a predicate function.

    Args:
        iterable: the items to split up
        predicate: called once per item; its result decides which list the item
            is placed in

    Returns:
        A pair of lists: first the items for which the predicate gave a truthy
        result, then the items for which it gave a falsy result. Each list keeps
        the items in the order they were encountered.
    """
    accepted: List[T] = []
    rejected: List[T] = []
    for candidate in iterable:
        bucket = accepted if predicate(candidate) else rejected
        bucket.append(candidate)
    return accepted, rejected


def sorted_topologically(
    nodes: Iterable[T],
    graph: Mapping[T, Collection[T]],
) -> Generator[T, None, None]:
    """Given a set of nodes and a graph, yield the nodes in topological order.

    `graph` maps a node to the nodes that have to be yielded before it. For
    example `sorted_topologically([1, 2], {1: [2]})` will yield `2, 1`.

    Entries of `graph` for nodes that are not in `nodes` are skipped, and edges
    pointing at nodes outside `nodes` do not hold anything back. When several
    nodes are ready at the same time the smallest one is yielded first. Nodes
    that are part of a cycle (or sit behind one) are never yielded.
    """

    # Kahn's algorithm: repeatedly emit a node with no outstanding
    # prerequisites, then release the nodes that were waiting on it.

    # Number of not-yet-emitted prerequisites for each requested node.
    outstanding = dict.fromkeys(nodes, 0)
    # For each node, the requested nodes that list it as a prerequisite.
    dependants: Dict[T, Set[T]] = {}

    for child, prerequisites in graph.items():
        if child in outstanding:
            # De-duplicate so a repeated edge is only counted once.
            for prerequisite in set(prerequisites):
                dependants.setdefault(prerequisite, set()).add(child)
                if prerequisite in outstanding:
                    outstanding[child] += 1
            dependants.setdefault(child, set())

    ready = [candidate for candidate, count in outstanding.items() if not count]
    heapq.heapify(ready)

    while ready:
        current = heapq.heappop(ready)
        yield current

        for waiting in dependants.get(current, ()):
            if waiting not in outstanding:
                continue
            outstanding[waiting] -= 1
            if not outstanding[waiting]:
                heapq.heappush(ready, waiting)


def sorted_topologically_batched(
    nodes: Iterable[T],
    graph: Mapping[T, Collection[T]],
) -> Generator[Collection[T], None, None]:
    r"""Walk the graph topologically, yielding batches of nodes such that every
    node a batch member depends on has been yielded in an earlier batch.

    `graph` maps a node to the nodes that must be returned before it. Only
    dependencies that are themselves in `nodes` hold a node back.

    For example, given the following graph:

         A
        / \
       B   C
        \ /
         D

    This function will return: `[[A], [B, C], [D]]`.

    This function is useful for e.g. batch persisting events in an auth chain,
    where we can only persist an event if all its auth events have already been
    persisted.
    """

    # Number of not-yet-returned dependencies for each requested node.
    outstanding = dict.fromkeys(nodes, 0)
    # For each node, the requested nodes that list it as a dependency.
    dependants: Dict[T, Set[T]] = {}

    for child, dependencies in graph.items():
        if child in outstanding:
            # De-duplicate so a repeated edge is only counted once.
            for dependency in set(dependencies):
                dependants.setdefault(dependency, set()).add(child)
                if dependency in outstanding:
                    outstanding[child] += 1
            dependants.setdefault(child, set())

    batch = [candidate for candidate, count in outstanding.items() if not count]

    while batch:
        # Work out which nodes become unblocked once this batch is done; they
        # form the next batch.
        following = []
        for finished in batch:
            for waiting in dependants.get(finished, ()):
                if waiting not in outstanding:
                    continue
                outstanding[waiting] -= 1
                if not outstanding[waiting]:
                    following.append(waiting)

        yield batch
        batch = following
move batch_iter to a separate module 2020-01-14 12:58:02 +01:00			`# Copyright 2014-2016 OpenMarket Ltd`
			`# Copyright 2020 The Matrix.org Foundation C.I.C.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
Use a chain cover index to efficiently calculate auth chain difference (#8868) 2021-01-11 17:09:22 +01:00			`import heapq`
move batch_iter to a separate module 2020-01-14 12:58:02 +01:00			`from itertools import islice`
Use a chain cover index to efficiently calculate auth chain difference (#8868) 2021-01-11 17:09:22 +01:00			`from typing import (`
Process previously failed backfill events in the background (#15585) Process previously failed backfill events in the background because they are bound to fail again and we don't need to waste time holding up the request for something that is bound to fail again. Fix https://github.com/matrix-org/synapse/issues/13623 Follow-up to https://github.com/matrix-org/synapse/issues/13621 and https://github.com/matrix-org/synapse/issues/13622 Part of making `/messages` faster: https://github.com/matrix-org/synapse/issues/13356 2023-05-25 06:22:24 +02:00			`Callable,`
Remove `synapse.types.Collection` (#9856) This is no longer required, since we have dropped support for Python 3.5. 2021-04-22 17:43:50 +02:00			`Collection,`
Use a chain cover index to efficiently calculate auth chain difference (#8868) 2021-01-11 17:09:22 +01:00			`Dict,`
			`Generator,`
			`Iterable,`
			`Iterator,`
Process previously failed backfill events in the background (#15585) Process previously failed backfill events in the background because they are bound to fail again and we don't need to waste time holding up the request for something that is bound to fail again. Fix https://github.com/matrix-org/synapse/issues/13623 Follow-up to https://github.com/matrix-org/synapse/issues/13621 and https://github.com/matrix-org/synapse/issues/13622 Part of making `/messages` faster: https://github.com/matrix-org/synapse/issues/13356 2023-05-25 06:22:24 +02:00			`List,`
Use a chain cover index to efficiently calculate auth chain difference (#8868) 2021-01-11 17:09:22 +01:00			`Mapping,`
			`Set,`
Encode JSON responses on a thread in C, mk2 (#10905) Currently we use `JsonEncoder.iterencode` to write JSON responses, which ensures that we don't block the main reactor thread when encoding huge objects. The downside to this is that `iterencode` falls back to using a pure Python encoder that is much less efficient and can easily burn a lot of CPU for huge responses. To fix this, while still ensuring we don't block the reactor loop, we encode the JSON on a threadpool using the standard `JsonEncoder.encode` functions, which is backed by a C library. Doing so, however, requires `respond_with_json` to have access to the reactor, which it previously didn't. There are two ways of doing this: 1. threading through the reactor object, which is a bit fiddly as e.g. `DirectServeJsonResource` doesn't currently take a reactor, but is exposed to modules and so is a PITA to change; or 2. expose the reactor in `SynapseRequest`, which requires updating a bunch of servlet types. I went with the latter as that is just a mechanical change, and I think makes sense as a request already has a reactor associated with it (via its http channel). 2021-09-28 11:37:58 +02:00			`Sized,`
Use a chain cover index to efficiently calculate auth chain difference (#8868) 2021-01-11 17:09:22 +01:00			`Tuple,`
			`TypeVar,`
			`)`

Encode JSON responses on a thread in C, mk2 (#10905) Currently we use `JsonEncoder.iterencode` to write JSON responses, which ensures that we don't block the main reactor thread when encoding huge objects. The downside to this is that `iterencode` falls back to using a pure Python encoder that is much less efficient and can easily burn a lot of CPU for huge responses. To fix this, while still ensuring we don't block the reactor loop, we encode the JSON on a threadpool using the standard `JsonEncoder.encode` functions, which is backed by a C library. Doing so, however, requires `respond_with_json` to have access to the reactor, which it previously didn't. There are two ways of doing this: 1. threading through the reactor object, which is a bit fiddly as e.g. `DirectServeJsonResource` doesn't currently take a reactor, but is exposed to modules and so is a PITA to change; or 2. expose the reactor in `SynapseRequest`, which requires updating a bunch of servlet types. I went with the latter as that is just a mechanical change, and I think makes sense as a request already has a reactor associated with it (via its http channel). 2021-09-28 11:37:58 +02:00			`from typing_extensions import Protocol`

move batch_iter to a separate module 2020-01-14 12:58:02 +01:00			`T = TypeVar("T")`
Encode JSON responses on a thread in C, mk2 (#10905) Currently we use `JsonEncoder.iterencode` to write JSON responses, which ensures that we don't block the main reactor thread when encoding huge objects. The downside to this is that `iterencode` falls back to using a pure Python encoder that is much less efficient and can easily burn a lot of CPU for huge responses. To fix this, while still ensuring we don't block the reactor loop, we encode the JSON on a threadpool using the standard `JsonEncoder.encode` functions, which is backed by a C library. Doing so, however, requires `respond_with_json` to have access to the reactor, which it previously didn't. There are two ways of doing this: 1. threading through the reactor object, which is a bit fiddly as e.g. `DirectServeJsonResource` doesn't currently take a reactor, but is exposed to modules and so is a PITA to change; or 2. expose the reactor in `SynapseRequest`, which requires updating a bunch of servlet types. I went with the latter as that is just a mechanical change, and I think makes sense as a request already has a reactor associated with it (via its http channel). 2021-09-28 11:37:58 +02:00			`S = TypeVar("S", bound="_SelfSlice")`


			`class _SelfSlice(Sized, Protocol):`
			`"""A helper protocol that matches types where taking a slice results in the`
			`same type being returned.`

			This is more specific than `Sequence`, which allows another `Sequence` to be
			`returned.`
			`"""`

			`def __getitem__(self: S, i: slice) -> S:`
			`...`
move batch_iter to a separate module 2020-01-14 12:58:02 +01:00

Add missing type hints to synapse.util (#9982) 2021-05-24 21:32:01 +02:00			`def batch_iter(iterable: Iterable[T], size: int) -> Iterator[Tuple[T, ...]]:`
move batch_iter to a separate module 2020-01-14 12:58:02 +01:00			`"""batch an iterable up into tuples with a maximum size`

			`Args:`
Add missing type hints to synapse.util (#9982) 2021-05-24 21:32:01 +02:00			`iterable: the iterable to slice`
			`size: the maximum batch size`
move batch_iter to a separate module 2020-01-14 12:58:02 +01:00
			`Returns:`
			`an iterator over the chunks`
			`"""`
			`# make sure we can deal with iterables like lists too`
			`sourceiter = iter(iterable)`
			`# call islice until it returns an empty tuple`
			`return iter(lambda: tuple(islice(sourceiter, size)), ())`
Log saml assertions rather than the whole response ... since the whole response is huge. We even need to break up the assertions, since kibana otherwise truncates them. 2020-01-16 23:26:34 +01:00

Encode JSON responses on a thread in C, mk2 (#10905) Currently we use `JsonEncoder.iterencode` to write JSON responses, which ensures that we don't block the main reactor thread when encoding huge objects. The downside to this is that `iterencode` falls back to using a pure Python encoder that is much less efficient and can easily burn a lot of CPU for huge responses. To fix this, while still ensuring we don't block the reactor loop, we encode the JSON on a threadpool using the standard `JsonEncoder.encode` functions, which is backed by a C library. Doing so, however, requires `respond_with_json` to have access to the reactor, which it previously didn't. There are two ways of doing this: 1. threading through the reactor object, which is a bit fiddly as e.g. `DirectServeJsonResource` doesn't currently take a reactor, but is exposed to modules and so is a PITA to change; or 2. expose the reactor in `SynapseRequest`, which requires updating a bunch of servlet types. I went with the latter as that is just a mechanical change, and I think makes sense as a request already has a reactor associated with it (via its http channel). 2021-09-28 11:37:58 +02:00			`def chunk_seq(iseq: S, maxlen: int) -> Iterator[S]:`
Log saml assertions rather than the whole response ... since the whole response is huge. We even need to break up the assertions, since kibana otherwise truncates them. 2020-01-16 23:26:34 +01:00			`"""Split the given sequence into chunks of the given size`

			`The last chunk may be shorter than the given size.`

			`If the input is empty, no chunks are returned.`
			`"""`
			`return (iseq[i : i + maxlen] for i in range(0, len(iseq), maxlen))`
Use a chain cover index to efficiently calculate auth chain difference (#8868) 2021-01-11 17:09:22 +01:00

Process previously failed backfill events in the background (#15585) Process previously failed backfill events in the background because they are bound to fail again and we don't need to waste time holding up the request for something that is bound to fail again. Fix https://github.com/matrix-org/synapse/issues/13623 Follow-up to https://github.com/matrix-org/synapse/issues/13621 and https://github.com/matrix-org/synapse/issues/13622 Part of making `/messages` faster: https://github.com/matrix-org/synapse/issues/13356 2023-05-25 06:22:24 +02:00			`def partition(`
			`iterable: Iterable[T], predicate: Callable[[T], bool]`
			`) -> Tuple[List[T], List[T]]:`
			`"""`
			`Separate a given iterable into two lists based on the result of a predicate function.`

			`Args:`
			`iterable: the iterable to partition (separate)`
			`predicate: a function that takes an item from the iterable and returns a boolean`

			`Returns:`
			`A tuple of two lists, the first containing all items for which the predicate`
			`returned True, the second containing all items for which the predicate returned`
			`False`
			`"""`
			`true_results = []`
			`false_results = []`
			`for item in iterable:`
			`if predicate(item):`
			`true_results.append(item)`
			`else:`
			`false_results.append(item)`
			`return true_results, false_results`


Use a chain cover index to efficiently calculate auth chain difference (#8868) 2021-01-11 17:09:22 +01:00			`def sorted_topologically(`
Update black, and run auto formatting over the codebase (#9381) - Update black version to the latest - Run black auto formatting over the codebase - Run autoformatting according to [`docs/code_style.md `](https://github.com/matrix-org/synapse/blob/80d6dc9783aa80886a133756028984dbf8920168/docs/code_style.md) - Update `code_style.md` docs around installing black to use the correct version 2021-02-16 23:32:34 +01:00			`nodes: Iterable[T],`
			`graph: Mapping[T, Collection[T]],`
Use a chain cover index to efficiently calculate auth chain difference (#8868) 2021-01-11 17:09:22 +01:00			`) -> Generator[T, None, None]:`
			`"""Given a set of nodes and a graph, yield the nodes in toplogical order.`

			For example `sorted_topologically([1, 2], {1: [2]})` will yield `2, 1`.
			`"""`

			`# This is implemented by Kahn's algorithm.`

			`degree_map = {node: 0 for node in nodes}`
Use inline type hints in `http/federation/`, `storage/` and `util/` (#10381) 2021-07-15 18:46:54 +02:00			`reverse_graph: Dict[T, Set[T]] = {}`
Use a chain cover index to efficiently calculate auth chain difference (#8868) 2021-01-11 17:09:22 +01:00
			`for node, edges in graph.items():`
			`if node not in degree_map:`
			`continue`

Fix chain cover update to handle events with duplicate auth events (#9210) 2021-01-22 20:44:08 +01:00			`for edge in set(edges):`
Use a chain cover index to efficiently calculate auth chain difference (#8868) 2021-01-11 17:09:22 +01:00			`if edge in degree_map:`
			`degree_map[node] += 1`

			`reverse_graph.setdefault(edge, set()).add(node)`
			`reverse_graph.setdefault(node, set())`

			`zero_degree = [node for node, degree in degree_map.items() if degree == 0]`
			`heapq.heapify(zero_degree)`

			`while zero_degree:`
			`node = heapq.heappop(zero_degree)`
			`yield node`

Fix event chain bg update. (#9118) We passed in a graph to `sorted_topologically` which didn't have an entry for each node (as we dropped nodes with no edges). 2021-01-14 19:57:32 +01:00			`for edge in reverse_graph.get(node, []):`
Use a chain cover index to efficiently calculate auth chain difference (#8868) 2021-01-11 17:09:22 +01:00			`if edge in degree_map:`
			`degree_map[edge] -= 1`
			`if degree_map[edge] == 0:`
			`heapq.heappush(zero_degree, edge)`
Speed up persisting large number of outliers (#16649) Recalculating the roots tuple every iteration could be very expensive, so instead let's do a topological sort. 2023-11-16 15:25:35 +01:00

			`def sorted_topologically_batched(`
			`nodes: Iterable[T],`
			`graph: Mapping[T, Collection[T]],`
			`) -> Generator[Collection[T], None, None]:`
			`r"""Walk the graph topologically, returning batches of nodes where all nodes`
			`that references it have been previously returned.`

			`For example, given the following graph:`

			`A`
			`/ \`
			`B C`
			`\ /`
			`D`

			This function will return: `[[A], [B, C], [D]]`.

			`This function is useful for e.g. batch persisting events in an auth chain,`
			`where we can only persist an event if all its auth events have already been`
			`persisted.`
			`"""`

			`degree_map = {node: 0 for node in nodes}`
			`reverse_graph: Dict[T, Set[T]] = {}`

			`for node, edges in graph.items():`
			`if node not in degree_map:`
			`continue`

			`for edge in set(edges):`
			`if edge in degree_map:`
			`degree_map[node] += 1`

			`reverse_graph.setdefault(edge, set()).add(node)`
			`reverse_graph.setdefault(node, set())`

			`zero_degree = [node for node, degree in degree_map.items() if degree == 0]`

			`while zero_degree:`
			`new_zero_degree = []`
			`for node in zero_degree:`
			`for edge in reverse_graph.get(node, []):`
			`if edge in degree_map:`
			`degree_map[edge] -= 1`
			`if degree_map[edge] == 0:`
			`new_zero_degree.append(edge)`

			`yield zero_degree`
			`zero_degree = new_zero_degree`