From 7b1861a58da2b7318e43876f9721d6830cc8b0f2 Mon Sep 17 00:00:00 2001
From: Stefan Pratter <stefan@20c.com>
Date: Thu, 2 May 2019 15:20:20 +0000
Subject: [PATCH] un-accent search on quick search and api filters (#310)

---
 Pipfile                                       |  8 +----
 .../management/commands/pdb_api_test.py       | 29 ++++++++++++++++
 peeringdb_server/rest.py                      |  5 +++
 peeringdb_server/search.py                    | 12 ++++---
 requirements.txt                              |  1 +
 tests/test_search.py                          | 33 +++++++++++++++++--
 6 files changed, 74 insertions(+), 14 deletions(-)

diff --git a/Pipfile b/Pipfile
index 968e0d44..0decc316 100644
--- a/Pipfile
+++ b/Pipfile
@@ -1,12 +1,9 @@
 [[source]]
-
 name = "pypi"
 url = "https://pypi.org/simple"
 verify_ssl = true
 
-
 [dev-packages]
-
 pytest = ">=2.8.7"
 pytest-cov = ">=2.0.0"
 pytest-django = ">=2.9.1"
@@ -14,9 +11,7 @@ pytest-filedata = ">=0.1.0"
 jsonschema = ">=2.6.0"
 facsimile = ">=1.1.1"
 
-
 [packages]
-
 certifi = "==2017.11.5"
 ipaddress = "==1.0.19"
 mysqlclient = "==1.3.9"
@@ -56,8 +51,7 @@ django = "==1.11.20"
 uwsgi = "==2.0.14"
 markdown = "==2.6.7"
 "twentyc.rpc" = "==0.3.5"
-
+unidecode = "==1.0.23"
 
 [requires]
-
 python_version = "2.7"
diff --git a/peeringdb_server/management/commands/pdb_api_test.py b/peeringdb_server/management/commands/pdb_api_test.py
index 6b7fabf7..d372c3c6 100644
--- a/peeringdb_server/management/commands/pdb_api_test.py
+++ b/peeringdb_server/management/commands/pdb_api_test.py
@@ -1,4 +1,5 @@
 #!/bin/env python
+# -*- coding: utf-8 -*-
 """
 series of integration/unit tests for the pdb api
 """
@@ -2113,6 +2114,34 @@ class TestJSON(unittest.TestCase):
 
         self.assertEqual(target, comp)
 
+    ##########################################################################
+
+    def test_guest_005_list_filter_accented(self):
+
+        """
+        Test that an accented ``name`` filter term matches unaccented names.
+        """
+
+        # TODO: sqlite3 is the testing backend and django 1.11 seems unable to
+        # set a collation on it, so the reverse case (plain term matching an
+        # accented name) cannot be tested for now; this test at least confirms
+        # that an accented filter term is unaccented correctly.
+        #
+        # Production runs mysql with flattened accents, so both directions
+        # should work there regardless.
+
+        org = Organization.objects.create(name="org unaccented", status="ok")
+        net = Network.objects.create(asn=12345, name=u"net unaccented",
+                                     status="ok", org=org)
+        ix = InternetExchange.objects.create(org=org, name=u"ix unaccented", status="ok")
+        fac = Facility.objects.create(org=org, name=u"fac unaccented", status="ok")
+
+        for tag in ["org","net","ix","fac"]:
+            data = self.db_guest.all(tag, name=u"{} unãccented".format(tag))
+            self.assertEqual(len(data), 1)
+
+
+
     ##########################################################################
     # READONLY PERMISSION TESTS
     # These tests assert that the readonly users cannot write anything
diff --git a/peeringdb_server/rest.py b/peeringdb_server/rest.py
index 538a346d..6a1d6dcf 100644
--- a/peeringdb_server/rest.py
+++ b/peeringdb_server/rest.py
@@ -1,4 +1,7 @@
 import importlib
+
+import unidecode
+
 from rest_framework import (routers, serializers, status, viewsets)
 from rest_framework.response import Response
 from rest_framework.views import exception_handler
@@ -286,6 +289,8 @@ class ModelViewSet(viewsets.ModelViewSet):
         filters = {}
         for k, v in self.request.query_params.items():
 
+            v = unidecode.unidecode(v)
+
             if k[-3:] == "_id" and k not in field_names:
                 k = k[:-3]
 
diff --git a/peeringdb_server/search.py b/peeringdb_server/search.py
index 00c192d1..b43d4708 100644
--- a/peeringdb_server/search.py
+++ b/peeringdb_server/search.py
@@ -5,6 +5,10 @@ from peeringdb_server.models import (UTC, InternetExchange, Network, Facility)
 import re
 import time
 import datetime
+import unidecode
+
+def unaccent(v):  # normalize text for accent- and case-insensitive matching
+    return unidecode.unidecode(v).lower()
 
 # SEARCH INDEX BE STORED HERE
 
@@ -130,7 +134,7 @@ def search(term):
 
     result = {tag: [] for tag, model in ref_dict.items()}
 
-    term = term.lower()
+    term = unaccent(term)
 
     # try to convert to int for numeric search matching
     typed_q = {}
@@ -153,7 +157,7 @@ def search(term):
     # rid of all the ifs
     for tag, index in search_index.items():
         for id, data in index.items():
-            if data.name.lower().find(term) > -1:
+            if unaccent(data.name).find(term) > -1:
                 result[tag].append({
                     "id": id,
                     "name": data.search_result_name,
@@ -162,7 +166,7 @@ def search(term):
                 continue
 
             if hasattr(data,
-                       'name_long') and data.name_long.lower().find(term) > -1:
+                       'name_long') and unaccent(data.name_long).find(term) > -1:
                 result[tag].append({
                     "id": id,
                     "name": data.search_result_name,
@@ -170,7 +174,7 @@ def search(term):
                 })
                 continue
 
-            if hasattr(data, 'aka') and data.aka.lower().find(term) > -1:
+            if hasattr(data, 'aka') and unaccent(data.aka).find(term) > -1:
                 result[tag].append({
                     "id": id,
                     "name": data.search_result_name,
diff --git a/requirements.txt b/requirements.txt
index d6fa4186..1a1ff469 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,6 +10,7 @@ Markdown==2.6.7
 bleach==2.1.3
 coreapi==2.3.1
 googlemaps==2.5.1
+Unidecode==1.0.23
 
 django-allauth==0.32.0
 django-autocomplete-light==3.2.9
diff --git a/tests/test_search.py b/tests/test_search.py
index 5685cd0c..9ee3bbd4 100644
--- a/tests/test_search.py
+++ b/tests/test_search.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 """
 Unit-tests for quick search functionality - note that advanced search is not
 tested here as that is using the PDB API entirely.
@@ -6,6 +7,7 @@ import re
 import datetime
 
 import pytest
+import unidecode
 
 from django.test import TestCase, RequestFactory
 
@@ -27,20 +29,26 @@ class SearchTests(TestCase):
         search.SEARCH_CACHE["search_index"] = {}
 
         cls.instances = {}
+        cls.instances_accented = {}
         cls.instances_sponsored = {}
 
         # create an instance of each searchable model, so we have something
         # to search for
         cls.org = models.Organization.objects.create(name="Test org")
         for model in search.searchable_models:
+            kwargs = {}
             if model.handleref.tag == "net":
                 kwargs = {"asn": 1}
-            else:
-                kwargs = {}
             cls.instances[model.handleref.tag] = model.objects.create(
                 status="ok", org=cls.org, name="Test %s" % model.handleref.tag,
                 **kwargs)
 
+            if model.handleref.tag == "net":
+                kwargs = {"asn": 2}
+            cls.instances_accented[model.handleref.tag] = model.objects.create(
+                status="ok", org=cls.org,
+                name=u"ãccented {}".format(model.handleref.tag), **kwargs)
+
         # we also need to test that sponsor ship status comes through
         # accordingly
         cls.org_w_sponsorship = models.Organization.objects.create(name="Sponsor org", status="ok")
@@ -52,7 +60,7 @@ class SearchTests(TestCase):
 
         for model in search.searchable_models:
             if model.handleref.tag == "net":
-                kwargs = {"asn": 2}
+                kwargs = {"asn": 3}
             else:
                 kwargs = {}
             cls.instances_sponsored[model.handleref.tag] = model.objects.create(
@@ -140,3 +148,22 @@ class SearchTests(TestCase):
         new_ix_p.delete()
         new_ix_o.delete()
         self.test_search()
+
+    def test_search_unaccent(self):
+        """
+        Search for the 'ãccented' entities with both an unaccented and an
+        accented term; each term must return exactly one result per tag.
+        """
+        rv = search.search(u"accented")
+        for k, inst in self.instances_accented.items():
+            assert k in rv
+            assert len(rv[k]) == 1
+            assert unidecode.unidecode(rv[k][0]["name"]) == unidecode.unidecode(inst.search_result_name)
+
+        rv = search.search(u"ãccented")
+        for k, inst in self.instances_accented.items():
+            assert k in rv
+            assert len(rv[k]) == 1
+            assert unidecode.unidecode(rv[k][0]["name"]) == unidecode.unidecode(inst.search_result_name)
+
+