align words with no refs with themselves

zfletch · zfletch · commit aa6f077dda6b · 2023-03-12T10:27:07.000-04:00
diff --git a/example/src/App/App.js b/example/src/App/App.js
@@ -5,11 +5,11 @@ import 'alignment-react/dist/index.css';
 
 const xml = `
 <aligned-text xmlns="http://alpheios.net/namespaces/aligned-text">
-  <language lnum="en" xml:lang="en" dir="ltr"/>
-  <language lnum="fr" xml:lang="fr" dir="ltr"/>
+  <language lnum="L1" xml:lang="eng" dir="ltr"/>
+  <language lnum="L2" xml:lang="fre" dir="ltr"/>
   <comment class="title">alignment</comment>
   <sentence id="1" document_id="">
-    <wds lnum="en">
+    <wds lnum="L1">
       <comment class="uri"/>
       <w n="1-1">
         <text>hello</text>
@@ -20,7 +20,7 @@ const xml = `
         <refs nrefs="1-2 1-3"/>
       </w>
     </wds>
-    <wds lnum="fr">
+    <wds lnum="L2">
       <comment class="uri"/>
       <w n="1-1">
         <text>bonjour</text>
@@ -42,8 +42,8 @@ const xml = `
 const App = () => (
   <Alignment alignment={xml}>
     <Sentence id="1">
-      <Segment lnum="en" />
-      <Segment lnum="fr" />
+      <Segment lnum="L1" />
+      <Segment lnum="L2" />
     </Sentence>
   </Alignment>
 );
diff --git a/src/components/Alignment/Sentence/Sentence.js b/src/components/Alignment/Sentence/Sentence.js
@@ -8,11 +8,11 @@ const uniq = (array) => Array.from(new Set(array));
 // The XML structure used by Perseids for alignments includes some gotchas:
 //   * Word IDs are not globally unique
 //   * It is possible for A to be aligned with B without B being aligned with A
-//   * References do not have a field for `lnum`, meaning that alignments with more
+//   * References do not have a field for lnum, meaning that alignments with more
 //     than one language are impossible
 // Future versions of Perseids should improve on the format, but this application needs
 // to be backwards compatible. This function converts the XML into an object that looks
-// like this (assuming languages have `lnum`s of `L1` and `L2`:
+// like this (assuming languages have lnums of L1 and L2):
 // {
 //   L1: {
 //     '1-1': {
@@ -48,6 +48,13 @@ const buildIdMap = (alignedText, sentence, id) => {
     idMap[outerLnum][outerN][innerLnum].add(innerN);
   };
 
+  // This is the hairiest part of the algorithm, but I'm not sure there's a better way
+  // to do it. When a word in L1 aligns with a word in L2, we add all members from
+  // the L1 set to the L2 set. (At this time, JavaScript has no Set union operation.)
+  // We then actually set the pointer for L1 and L2 to the *same Set*. This means
+  // that any future additions affect both L1 and L2. The reason we do this is to account
+  // for "sibling" words: two words with the same lnum that align to the same word in
+  // the other lnum.
   const unifySets = (lnum1, n1, lnum2, n2) => {
     idMap[lnum1][n1][lnum1].forEach((v) => idMap[lnum2][n2][lnum1].add(v));
     idMap[lnum1][n1][lnum2].forEach((v) => idMap[lnum2][n2][lnum2].add(v));
@@ -56,19 +63,19 @@ const buildIdMap = (alignedText, sentence, id) => {
     idMap[lnum2][n2][lnum2] = idMap[lnum1][n1][lnum2];
   };
 
-  sentence.wds.forEach(wd => {
+  (sentence.wds || []).forEach(wd => {
     const lnum = wd.$.lnum;
 
     wd.w.forEach(word => {
       const n = word.$.n;
+      // Align every word with itself
+      addToSet(lnum, n, lnum, n);
 
       if (word.refs) {
         const nrefs = word.refs[0].$.nrefs.split(/\s+/);
 
         lnums.forEach(lnumRef => {
           if (lnumRef !== lnum) {
-            addToSet(lnum, n, lnum, n);
-
             nrefs.forEach(nref => {
               addToSet(lnum, n, lnumRef, nref);
               addToSet(lnumRef, nref, lnum, n);
@@ -90,7 +97,9 @@ const WrappedSentence = ({ id, json, children }) => {
 
   useEffect(() => {
     const alignedText = json['aligned-text'];
-    const sentence = alignedText.sentence.find(({ $: { id: sentenceId }}) => sentenceId === id);
+    const sentence = alignedText
+      ? alignedText.sentence.find(({ $: { id: sentenceId }}) => sentenceId === id)
+      : {};
 
     setSentence(sentence);
     setIdMap(buildIdMap(alignedText, sentence));