Skip to content

Commit 093518e

Browse files
authored
Merge pull request #65 from vkrithika25/main
#62 Added crosswalk table + unit tests
2 parents 50dc096 + 35264d6 commit 093518e

2 files changed

Lines changed: 149 additions & 0 deletions

File tree

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
"""
2+
MIT License
3+
4+
Copyright (c) 2023 Ulster University (https://www.ulster.ac.uk).
5+
Project: Harmony (https://harmonydata.ac.uk)
6+
Maintainer: Thomas Wood (https://fastdatascience.com)
7+
8+
Permission is hereby granted, free of charge, to any person obtaining a copy
9+
of this software and associated documentation files (the "Software"), to deal
10+
in the Software without restriction, including without limitation the rights
11+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12+
copies of the Software, and to permit persons to whom the Software is
13+
furnished to do so, subject to the following conditions:
14+
15+
The above copyright notice and this permission notice shall be included in all
16+
copies or substantial portions of the Software.
17+
18+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24+
SOFTWARE.
25+
"""
26+
27+
import pandas as pd
28+
29+
def generate_crosswalk_table(all_questions, similarity, threshold):
    """
    Build a crosswalk table of question pairs whose similarity exceeds a threshold.

    Every unordered pair of positions (i, j) with i < j is examined exactly once,
    so a question is never paired with itself and mirrored duplicates are not
    produced.

    :param all_questions: iterable of objects exposing ``question_no`` and
        ``question_text`` attributes.
    :param similarity: two-dimensional structure indexable as ``similarity[i, j]``,
        where i and j are positions in ``all_questions``.
    :param threshold: a pair is kept only when its score is strictly greater
        than this value.
    :return: a pandas DataFrame with one row per matching pair and the columns
        ``pair_name``, ``question1_no``, ``question1_text``, ``question2_no``,
        ``question2_text`` and ``match_score``. The columns are present even
        when no pair matches, so callers can select them on an empty result.
    """
    columns = [
        'pair_name',
        'question1_no',
        'question1_text',
        'question2_no',
        'question2_text',
        'match_score',
    ]

    # materialise once so the questions can be indexed by position
    questions = list(all_questions)
    matching_pairs = []

    for i, q1 in enumerate(questions):
        # start at i + 1: skips the diagonal and the mirrored lower triangle
        for j in range(i + 1, len(questions)):
            score = similarity[i, j]
            if score > threshold:
                q2 = questions[j]
                matching_pairs.append({
                    'pair_name': f"{i}_{j}",
                    'question1_no': q1.question_no,
                    'question1_text': q1.question_text,
                    'question2_no': q2.question_no,
                    'question2_text': q2.question_text,
                    'match_score': score
                })

    # explicit columns keep the schema stable when matching_pairs is empty
    return pd.DataFrame(matching_pairs, columns=columns)

tests/test_crosswalk.py

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
"""
2+
MIT License
3+
4+
Copyright (c) 2023 Ulster University (https://www.ulster.ac.uk).
5+
Project: Harmony (https://harmonydata.ac.uk)
6+
Maintainer: Thomas Wood (https://fastdatascience.com)
7+
8+
Permission is hereby granted, free of charge, to any person obtaining a copy
9+
of this software and associated documentation files (the "Software"), to deal
10+
in the Software without restriction, including without limitation the rights
11+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12+
copies of the Software, and to permit persons to whom the Software is
13+
furnished to do so, subject to the following conditions:
14+
15+
The above copyright notice and this permission notice shall be included in all
16+
copies or substantial portions of the Software.
17+
18+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24+
SOFTWARE.
25+
26+
"""
27+
28+
import sys
29+
import unittest
30+
import pandas as pd
31+
import numpy as np
32+
33+
sys.path.append("../src")
34+
35+
from harmony.matching.generate_crosswalk_table import generate_crosswalk_table
36+
from harmony import match_instruments
37+
from harmony.schemas.requests.text import Instrument, Question
38+
39+
class TestGenerateCrosswalkTable(unittest.TestCase):
    """Tests for generate_crosswalk_table using a hand-written similarity matrix
    and the similarity matrix returned by match_instruments."""

    def setUp(self):
        # Three one-word questions numbered "1".."3"
        dummy_texts = ["potato", "tomato", "radish"]
        self.all_questions_dummy = [
            Question(question_no=str(number), question_text=text)
            for number, text in enumerate(dummy_texts, start=1)
        ]
        self.instruments_dummy = Instrument(questions=self.all_questions_dummy)

        # Symmetric matrix: every off-diagonal score is above self.threshold
        self.similarity = np.array([
            [1.0, 0.7, 0.9],
            [0.7, 1.0, 0.8],
            [0.9, 0.8, 1.0]
        ])

        # Two full-sentence questions that are passed through match_instruments
        first_real = Question(question_no="1", question_text="Feeling nervous, anxious, or on edge")
        second_real = Question(question_no="2", question_text="Not being able to stop or control worrying")
        self.all_questions_real = [first_real, second_real]
        self.instruments = Instrument(questions=self.all_questions_real)

        self.threshold = 0.6

    def _assert_rows_present(self, expected_matches, result):
        """Assert that each expected match equals at least one row of result."""
        expected_frame = pd.DataFrame(expected_matches)
        for _, wanted in expected_frame.iterrows():
            found = False
            for _, candidate in result.iterrows():
                if wanted.equals(candidate):
                    found = True
                    break
            self.assertTrue(found)

    def test_generate_crosswalk_table_dummy_data(self):
        result = generate_crosswalk_table(self.instruments_dummy.questions, self.similarity, self.threshold)

        expected_matches = [
            {"pair_name": "0_1", "question1_no": "1", "question1_text": "potato",
             "question2_no": "2", "question2_text": "tomato", "match_score": 0.7},
            {"pair_name": "0_2", "question1_no": "1", "question1_text": "potato",
             "question2_no": "3", "question2_text": "radish", "match_score": 0.9},
            {"pair_name": "1_2", "question1_no": "2", "question1_text": "tomato",
             "question2_no": "3", "question2_text": "radish", "match_score": 0.8},
        ]

        # every expected pair appears, and nothing beyond them
        self._assert_rows_present(expected_matches, result)
        self.assertEqual(len(result), len(expected_matches))

    def test_generate_crosswalk_table_empty(self):
        # Identity matrix: off-diagonal scores are 0, so nothing exceeds the threshold
        empty_similarity = np.eye(3)
        result = generate_crosswalk_table(self.all_questions_dummy, empty_similarity, self.threshold)
        self.assertTrue(result.empty)

    def test_generate_crosswalk_table_real(self):
        all_questions, similarity_with_polarity, _, _ = match_instruments([self.instruments])

        # at the default threshold no pair is expected
        expected_matches = []
        result = generate_crosswalk_table(all_questions, similarity_with_polarity, self.threshold)
        self._assert_rows_present(expected_matches, result)
        self.assertEqual(len(result), len(expected_matches))

        # at a lower threshold exactly one pair is expected
        lower_threshold = 0.5
        result = generate_crosswalk_table(all_questions, similarity_with_polarity, lower_threshold)
        self.assertEqual(len(result), 1)
98+
99+
100+
# Run the test suite when this file is executed directly.
if __name__ == "__main__":
    unittest.main()

0 commit comments

Comments
 (0)