Merge branch 'erikj/media_spam_checker' into develop
commit
adc96d4236
|
@ -0,0 +1 @@
|
||||||
|
Add hook to spam checker modules that allow checking file uploads and remote downloads.
|
|
@ -61,6 +61,9 @@ class ExampleSpamChecker:
|
||||||
|
|
||||||
async def check_registration_for_spam(self, email_threepid, username, request_info):
|
async def check_registration_for_spam(self, email_threepid, username, request_info):
|
||||||
return RegistrationBehaviour.ALLOW # allow all registrations
|
return RegistrationBehaviour.ALLOW # allow all registrations
|
||||||
|
|
||||||
|
async def check_media_file_for_spam(self, file_wrapper, file_info):
|
||||||
|
return False # allow all media
|
||||||
```
|
```
|
||||||
|
|
||||||
## Configuration
|
## Configuration
|
||||||
|
|
|
@ -17,6 +17,8 @@
|
||||||
import inspect
|
import inspect
|
||||||
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
|
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
|
||||||
|
|
||||||
|
from synapse.rest.media.v1._base import FileInfo
|
||||||
|
from synapse.rest.media.v1.media_storage import ReadableFileWrapper
|
||||||
from synapse.spam_checker_api import RegistrationBehaviour
|
from synapse.spam_checker_api import RegistrationBehaviour
|
||||||
from synapse.types import Collection
|
from synapse.types import Collection
|
||||||
from synapse.util.async_helpers import maybe_awaitable
|
from synapse.util.async_helpers import maybe_awaitable
|
||||||
|
@ -214,3 +216,48 @@ class SpamChecker:
|
||||||
return behaviour
|
return behaviour
|
||||||
|
|
||||||
return RegistrationBehaviour.ALLOW
|
return RegistrationBehaviour.ALLOW
|
||||||
|
|
||||||
|
async def check_media_file_for_spam(
|
||||||
|
self, file_wrapper: ReadableFileWrapper, file_info: FileInfo
|
||||||
|
) -> bool:
|
||||||
|
"""Checks if a piece of newly uploaded media should be blocked.
|
||||||
|
|
||||||
|
This will be called for local uploads, downloads of remote media, each
|
||||||
|
thumbnail generated for those, and web pages/images used for URL
|
||||||
|
previews.
|
||||||
|
|
||||||
|
Note that care should be taken to not do blocking IO operations in the
|
||||||
|
main thread. For example, to get the contents of a file a module
|
||||||
|
should do::
|
||||||
|
|
||||||
|
async def check_media_file_for_spam(
|
||||||
|
self, file: ReadableFileWrapper, file_info: FileInfo
|
||||||
|
) -> bool:
|
||||||
|
buffer = BytesIO()
|
||||||
|
await file.write_chunks_to(buffer.write)
|
||||||
|
|
||||||
|
if buffer.getvalue() == b"Hello World":
|
||||||
|
return True
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
Args:
|
||||||
|
file: An object that allows reading the contents of the media.
|
||||||
|
file_info: Metadata about the file.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
True if the media should be blocked or False if it should be
|
||||||
|
allowed.
|
||||||
|
"""
|
||||||
|
|
||||||
|
for spam_checker in self.spam_checkers:
|
||||||
|
# For backwards compatibility, only run if the method exists on the
|
||||||
|
# spam checker
|
||||||
|
checker = getattr(spam_checker, "check_media_file_for_spam", None)
|
||||||
|
if checker:
|
||||||
|
spam = await maybe_awaitable(checker(file_wrapper, file_info))
|
||||||
|
if spam:
|
||||||
|
return True
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
|
@ -16,13 +16,17 @@ import contextlib
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import shutil
|
import shutil
|
||||||
from typing import IO, TYPE_CHECKING, Any, Optional, Sequence
|
from typing import IO, TYPE_CHECKING, Any, Callable, Optional, Sequence
|
||||||
|
|
||||||
|
import attr
|
||||||
|
|
||||||
from twisted.internet.defer import Deferred
|
from twisted.internet.defer import Deferred
|
||||||
from twisted.internet.interfaces import IConsumer
|
from twisted.internet.interfaces import IConsumer
|
||||||
from twisted.protocols.basic import FileSender
|
from twisted.protocols.basic import FileSender
|
||||||
|
|
||||||
|
from synapse.api.errors import NotFoundError
|
||||||
from synapse.logging.context import defer_to_thread, make_deferred_yieldable
|
from synapse.logging.context import defer_to_thread, make_deferred_yieldable
|
||||||
|
from synapse.util import Clock
|
||||||
from synapse.util.file_consumer import BackgroundFileConsumer
|
from synapse.util.file_consumer import BackgroundFileConsumer
|
||||||
|
|
||||||
from ._base import FileInfo, Responder
|
from ._base import FileInfo, Responder
|
||||||
|
@ -58,6 +62,8 @@ class MediaStorage:
|
||||||
self.local_media_directory = local_media_directory
|
self.local_media_directory = local_media_directory
|
||||||
self.filepaths = filepaths
|
self.filepaths = filepaths
|
||||||
self.storage_providers = storage_providers
|
self.storage_providers = storage_providers
|
||||||
|
self.spam_checker = hs.get_spam_checker()
|
||||||
|
self.clock = hs.get_clock()
|
||||||
|
|
||||||
async def store_file(self, source: IO, file_info: FileInfo) -> str:
|
async def store_file(self, source: IO, file_info: FileInfo) -> str:
|
||||||
"""Write `source` to the on disk media store, and also any other
|
"""Write `source` to the on disk media store, and also any other
|
||||||
|
@ -127,18 +133,29 @@ class MediaStorage:
|
||||||
f.flush()
|
f.flush()
|
||||||
f.close()
|
f.close()
|
||||||
|
|
||||||
|
spam = await self.spam_checker.check_media_file_for_spam(
|
||||||
|
ReadableFileWrapper(self.clock, fname), file_info
|
||||||
|
)
|
||||||
|
if spam:
|
||||||
|
logger.info("Blocking media due to spam checker")
|
||||||
|
# Note that we'll delete the stored media, due to the
|
||||||
|
# try/except below. The media also won't be stored in
|
||||||
|
# the DB.
|
||||||
|
raise SpamMediaException()
|
||||||
|
|
||||||
for provider in self.storage_providers:
|
for provider in self.storage_providers:
|
||||||
await provider.store_file(path, file_info)
|
await provider.store_file(path, file_info)
|
||||||
|
|
||||||
finished_called[0] = True
|
finished_called[0] = True
|
||||||
|
|
||||||
yield f, fname, finish
|
yield f, fname, finish
|
||||||
except Exception:
|
except Exception as e:
|
||||||
try:
|
try:
|
||||||
os.remove(fname)
|
os.remove(fname)
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
raise
|
|
||||||
|
raise e from None
|
||||||
|
|
||||||
if not finished_called:
|
if not finished_called:
|
||||||
raise Exception("Finished callback not called")
|
raise Exception("Finished callback not called")
|
||||||
|
@ -302,3 +319,39 @@ class FileResponder(Responder):
|
||||||
|
|
||||||
def __exit__(self, exc_type, exc_val, exc_tb):
|
def __exit__(self, exc_type, exc_val, exc_tb):
|
||||||
self.open_file.close()
|
self.open_file.close()
|
||||||
|
|
||||||
|
|
||||||
|
class SpamMediaException(NotFoundError):
|
||||||
|
"""The media was blocked by a spam checker, so we simply 404 the request (in
|
||||||
|
the same way as if it was quarantined).
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
@attr.s(slots=True)
|
||||||
|
class ReadableFileWrapper:
|
||||||
|
"""Wrapper that allows reading a file in chunks, yielding to the reactor,
|
||||||
|
and writing to a callback.
|
||||||
|
|
||||||
|
This is simplified `FileSender` that takes an IO object rather than an
|
||||||
|
`IConsumer`.
|
||||||
|
"""
|
||||||
|
|
||||||
|
CHUNK_SIZE = 2 ** 14
|
||||||
|
|
||||||
|
clock = attr.ib(type=Clock)
|
||||||
|
path = attr.ib(type=str)
|
||||||
|
|
||||||
|
async def write_chunks_to(self, callback: Callable[[bytes], None]):
|
||||||
|
"""Reads the file in chunks and calls the callback with each chunk.
|
||||||
|
"""
|
||||||
|
|
||||||
|
with open(self.path, "rb") as file:
|
||||||
|
while True:
|
||||||
|
chunk = file.read(self.CHUNK_SIZE)
|
||||||
|
if not chunk:
|
||||||
|
break
|
||||||
|
|
||||||
|
callback(chunk)
|
||||||
|
|
||||||
|
# We yield to the reactor by sleeping for 0 seconds.
|
||||||
|
await self.clock.sleep(0)
|
||||||
|
|
|
@ -22,6 +22,7 @@ from twisted.web.http import Request
|
||||||
from synapse.api.errors import Codes, SynapseError
|
from synapse.api.errors import Codes, SynapseError
|
||||||
from synapse.http.server import DirectServeJsonResource, respond_with_json
|
from synapse.http.server import DirectServeJsonResource, respond_with_json
|
||||||
from synapse.http.servlet import parse_string
|
from synapse.http.servlet import parse_string
|
||||||
|
from synapse.rest.media.v1.media_storage import SpamMediaException
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from synapse.app.homeserver import HomeServer
|
from synapse.app.homeserver import HomeServer
|
||||||
|
@ -86,9 +87,14 @@ class UploadResource(DirectServeJsonResource):
|
||||||
# disposition = headers.getRawHeaders(b"Content-Disposition")[0]
|
# disposition = headers.getRawHeaders(b"Content-Disposition")[0]
|
||||||
# TODO(markjh): parse content-dispostion
|
# TODO(markjh): parse content-dispostion
|
||||||
|
|
||||||
content_uri = await self.media_repo.create_content(
|
try:
|
||||||
media_type, upload_name, request.content, content_length, requester.user
|
content_uri = await self.media_repo.create_content(
|
||||||
)
|
media_type, upload_name, request.content, content_length, requester.user
|
||||||
|
)
|
||||||
|
except SpamMediaException:
|
||||||
|
# For uploading of media we want to respond with a 400, instead of
|
||||||
|
# the default 404, as that would just be confusing.
|
||||||
|
raise SynapseError(400, "Bad content")
|
||||||
|
|
||||||
logger.info("Uploaded content with URI %r", content_uri)
|
logger.info("Uploaded content with URI %r", content_uri)
|
||||||
|
|
||||||
|
|
|
@ -30,6 +30,8 @@ from twisted.internet import defer
|
||||||
from twisted.internet.defer import Deferred
|
from twisted.internet.defer import Deferred
|
||||||
|
|
||||||
from synapse.logging.context import make_deferred_yieldable
|
from synapse.logging.context import make_deferred_yieldable
|
||||||
|
from synapse.rest import admin
|
||||||
|
from synapse.rest.client.v1 import login
|
||||||
from synapse.rest.media.v1._base import FileInfo
|
from synapse.rest.media.v1._base import FileInfo
|
||||||
from synapse.rest.media.v1.filepath import MediaFilePaths
|
from synapse.rest.media.v1.filepath import MediaFilePaths
|
||||||
from synapse.rest.media.v1.media_storage import MediaStorage
|
from synapse.rest.media.v1.media_storage import MediaStorage
|
||||||
|
@ -37,6 +39,7 @@ from synapse.rest.media.v1.storage_provider import FileStorageProviderBackend
|
||||||
|
|
||||||
from tests import unittest
|
from tests import unittest
|
||||||
from tests.server import FakeSite, make_request
|
from tests.server import FakeSite, make_request
|
||||||
|
from tests.utils import default_config
|
||||||
|
|
||||||
|
|
||||||
class MediaStorageTests(unittest.HomeserverTestCase):
|
class MediaStorageTests(unittest.HomeserverTestCase):
|
||||||
|
@ -398,3 +401,94 @@ class MediaRepoTests(unittest.HomeserverTestCase):
|
||||||
headers.getRawHeaders(b"X-Robots-Tag"),
|
headers.getRawHeaders(b"X-Robots-Tag"),
|
||||||
[b"noindex, nofollow, noarchive, noimageindex"],
|
[b"noindex, nofollow, noarchive, noimageindex"],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestSpamChecker:
|
||||||
|
"""A spam checker module that rejects all media that includes the bytes
|
||||||
|
`evil`.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, config, api):
|
||||||
|
self.config = config
|
||||||
|
self.api = api
|
||||||
|
|
||||||
|
def parse_config(config):
|
||||||
|
return config
|
||||||
|
|
||||||
|
async def check_event_for_spam(self, foo):
|
||||||
|
return False # allow all events
|
||||||
|
|
||||||
|
async def user_may_invite(self, inviter_userid, invitee_userid, room_id):
|
||||||
|
return True # allow all invites
|
||||||
|
|
||||||
|
async def user_may_create_room(self, userid):
|
||||||
|
return True # allow all room creations
|
||||||
|
|
||||||
|
async def user_may_create_room_alias(self, userid, room_alias):
|
||||||
|
return True # allow all room aliases
|
||||||
|
|
||||||
|
async def user_may_publish_room(self, userid, room_id):
|
||||||
|
return True # allow publishing of all rooms
|
||||||
|
|
||||||
|
async def check_media_file_for_spam(self, file_wrapper, file_info) -> bool:
|
||||||
|
buf = BytesIO()
|
||||||
|
await file_wrapper.write_chunks_to(buf.write)
|
||||||
|
|
||||||
|
return b"evil" in buf.getvalue()
|
||||||
|
|
||||||
|
|
||||||
|
class SpamCheckerTestCase(unittest.HomeserverTestCase):
|
||||||
|
servlets = [
|
||||||
|
login.register_servlets,
|
||||||
|
admin.register_servlets,
|
||||||
|
]
|
||||||
|
|
||||||
|
def prepare(self, reactor, clock, hs):
|
||||||
|
self.user = self.register_user("user", "pass")
|
||||||
|
self.tok = self.login("user", "pass")
|
||||||
|
|
||||||
|
# Allow for uploading and downloading to/from the media repo
|
||||||
|
self.media_repo = hs.get_media_repository_resource()
|
||||||
|
self.download_resource = self.media_repo.children[b"download"]
|
||||||
|
self.upload_resource = self.media_repo.children[b"upload"]
|
||||||
|
|
||||||
|
def default_config(self):
|
||||||
|
config = default_config("test")
|
||||||
|
|
||||||
|
config.update(
|
||||||
|
{
|
||||||
|
"spam_checker": [
|
||||||
|
{
|
||||||
|
"module": TestSpamChecker.__module__ + ".TestSpamChecker",
|
||||||
|
"config": {},
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
return config
|
||||||
|
|
||||||
|
def test_upload_innocent(self):
|
||||||
|
"""Attempt to upload some innocent data that should be allowed.
|
||||||
|
"""
|
||||||
|
|
||||||
|
image_data = unhexlify(
|
||||||
|
b"89504e470d0a1a0a0000000d4948445200000001000000010806"
|
||||||
|
b"0000001f15c4890000000a49444154789c63000100000500010d"
|
||||||
|
b"0a2db40000000049454e44ae426082"
|
||||||
|
)
|
||||||
|
|
||||||
|
self.helper.upload_media(
|
||||||
|
self.upload_resource, image_data, tok=self.tok, expect_code=200
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_upload_ban(self):
|
||||||
|
"""Attempt to upload some data that includes bytes "evil", which should
|
||||||
|
get rejected by the spam checker.
|
||||||
|
"""
|
||||||
|
|
||||||
|
data = b"Some evil data"
|
||||||
|
|
||||||
|
self.helper.upload_media(
|
||||||
|
self.upload_resource, data, tok=self.tok, expect_code=400
|
||||||
|
)
|
||||||
|
|
Loading…
Reference in New Issue